261 files changed, 9006 insertions, 4231 deletions
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index b9afc7a..75729b3 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -29,7 +29,7 @@ runtime_targets="${4}" runtime_targets_needs_reconfig="${5}" enable_cir="${6}" -lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" +lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests --succinct" start-group "CMake" export PIP_BREAK_SYSTEM_PACKAGES=1 @@ -65,12 +65,12 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ start-group "ninja" # Targets are not escaped as they are passed as separate arguments. -ninja -C "${BUILD_DIR}" -k 0 ${targets} +ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log if [[ "${runtime_targets}" != "" ]]; then start-group "ninja Runtimes" - ninja -C "${BUILD_DIR}" ${runtime_targets} + ninja -C "${BUILD_DIR}" ${runtime_targets} |& tee ninja_runtimes.log fi # Compiling runtimes with just-built Clang and running their tests @@ -85,7 +85,8 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then start-group "ninja Runtimes C++26" - ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} + ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \ + |& tee ninja_runtimes_needs_reconfig1.log start-group "CMake Runtimes Clang Modules" @@ -96,5 +97,6 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then start-group "ninja Runtimes Clang Modules" - ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} + ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \ + |& tee ninja_runtimes_needs_reconfig2.log fi diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 2e81e27..0f3a199 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -39,7 +39,7 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_ASSERTIONS=ON \ -D LLVM_BUILD_EXAMPLES=ON \ -D COMPILER_RT_BUILD_LIBFUZZER=OFF \ - -D LLVM_LIT_ARGS="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" \ + -D LLVM_LIT_ARGS="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests --succinct" \ -D COMPILER_RT_BUILD_ORC=OFF \ -D CMAKE_C_COMPILER_LAUNCHER=sccache \ -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \ @@ -51,4 +51,4 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ start-group "ninja" # Targets are not escaped as they are passed as separate arguments. 
-ninja -C "${BUILD_DIR}" -k 0 ${targets} +ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index ea789ab..5c21c25 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -143,3 +143,6 @@ a3a007ad5fa20abc90ead4e1030b481bf109b4cf b7e332d3f59f567b1999fbcc660d7837cba8e406 6056f942abe83b05406df8b04e95ec37a3d160b5 906295b8a31c8dac5aa845864c0bca9f02f86184 + +# [mlir][tensor][linalg] Move Pack/UnPack Ops to Linalg +517800e37e8d3a4ee84214bef65e227612c2a98b diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml new file mode 100644 index 0000000..48b6c69 --- /dev/null +++ b/.github/workflows/mlir-spirv-tests.yml @@ -0,0 +1,32 @@ +name: MLIR SPIR-V Tests + +permissions: + contents: read + +on: + workflow_dispatch: + pull_request: + paths: + - 'mlir/include/mlir/Dialect/SPIRV/**' + - 'mlir/lib/Dialect/SPIRV/**' + - 'mlir/include/mlir/Target/SPIRV/**' + - 'mlir/lib/Target/SPIRV/**' + - 'mlir/test/Target/SPIRV/**' + - '.github/workflows/mlir-spirv-tests.yml' + +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + check_spirv: + if: github.repository_owner == 'llvm' + name: Test MLIR SPIR-V + uses: ./.github/workflows/llvm-project-tests.yml + with: + build_target: check-mlir + projects: mlir + extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="host" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON' + os_list: '["ubuntu-24.04"]' diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 12dce30..dfcf075 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2724,6 +2724,7 @@ public: bool isHLSLAttributedResourceType() const; bool isHLSLInlineSpirvType() const; bool isHLSLResourceRecord() const; + bool isHLSLResourceRecordArray() const; bool isHLSLIntangibleType() const; // Any HLSL intangible type (builtin, array, class) diff --git a/clang/include/clang/Basic/ABIVersions.def b/clang/include/clang/Basic/ABIVersions.def new file mode 100644 index 0000000..f6524bc --- /dev/null +++ b/clang/include/clang/Basic/ABIVersions.def @@ -0,0 +1,135 @@ +//===--- ABIVersions.def - Clang ABI Versions Database ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file enumerates Clang ABI versions. +// +//===----------------------------------------------------------------------===// +// +/// @file ABIVersions.def +/// +/// In this file, each of the Clang ABI Versions is enumerated +/// ABI_VER_MAJOR_MINOR, ABI_VER_MAJOR, or ABI_VER_LATEST macro. +/// +/// ABI_VER_MAJOR is used when the minor version is 0 or can be omitted. +/// +/// The first argument of ABI_VER_MAJOR_MINOR and ABI_VER_MAJOR is the major +/// version. +/// +/// The second argument of ABI_VER_MAJOR_MINOR is the minor version. +/// +/// The first argument of ABI_VER_LATEST is an identifier `Latest`. 
+ +#if defined(ABI_VER_MAJOR_MINOR) != defined(ABI_VER_MAJOR) || \ + defined(ABI_VER_MAJOR) != defined(ABI_VER_LATEST) +# error ABI_VER_MAJOR_MINOR, ABI_VER_MAJOR and ABI_VER_LATEST should be defined simultaneously +#endif + +#ifndef ABI_VER_MAJOR_MINOR +# define ABI_VER_MAJOR_MINOR(Major, Minor) +#endif + +#ifndef ABI_VER_MAJOR +# define ABI_VER_MAJOR(Major) +#endif + +#ifndef ABI_VER_LATEST +# define ABI_VER_LATEST(Latest) +#endif + +/// Attempt to be ABI-compatible with code generated by Clang 3.8.x +/// (SVN r257626). This causes <1 x long long> to be passed in an integer +/// register instead of an SSE register on x86_64. +ABI_VER_MAJOR_MINOR(3, 8) + +/// Attempt to be ABI-compatible with code generated by Clang 4.0.x +/// (SVN r291814). This causes move operations to be ignored when determining +/// whether a class type can be passed or returned directly. +ABI_VER_MAJOR(4) + +/// Attempt to be ABI-compatible with code generated by Clang 6.0.x +/// (SVN r321711). This causes determination of whether a type is +/// standard-layout to ignore collisions between empty base classes and between +/// base classes and member subobjects, which affects whether we reuse base +/// class tail padding in some ABIs. +ABI_VER_MAJOR(6) + +/// Attempt to be ABI-compatible with code generated by Clang 7.0.x +/// (SVN r338536). This causes alignof (C++) and _Alignof (C11) to be compatible +/// with __alignof (i.e., return the preferred alignment) rather than returning +/// the required alignment. +ABI_VER_MAJOR(7) + +/// Attempt to be ABI-compatible with code generated by Clang 9.0.x +/// (SVN r351319). This causes vectors of __int128 to be passed in memory +/// instead of passing in multiple scalar registers on x86_64 on Linux and +/// NetBSD. +ABI_VER_MAJOR(9) + +/// Attempt to be ABI-compatible with code generated by Clang 11.0.x +/// (git 2e10b7a39b93). This causes clang to pass unions with a 256-bit vector +/// member on the stack instead of using registers, to not properly mangle +/// substitutions for template names in some cases, and to mangle declaration +/// template arguments without a cast to the parameter type even when that can +/// lead to mangling collisions. +ABI_VER_MAJOR(11) + +/// Attempt to be ABI-compatible with code generated by Clang 12.0.x +/// (git 8e464dd76bef). This causes clang to mangle lambdas within global-scope +/// inline variables incorrectly. +ABI_VER_MAJOR(12) + +/// Attempt to be ABI-compatible with code generated by Clang 14.0.x. +/// This causes clang to: +/// - mangle dependent nested names incorrectly. +/// - make trivial only those defaulted copy constructors with a +/// parameter-type-list equivalent to the parameter-type-list of an implicit +/// declaration. +ABI_VER_MAJOR(14) + +/// Attempt to be ABI-compatible with code generated by Clang 15.0.x. +/// This causes clang to: +/// - Reverse the implementation for CWG692, CWG1395 and CWG1432. +/// - pack non-POD members of packed structs. +/// - consider classes with defaulted special member functions non-pod. +ABI_VER_MAJOR(15) + +/// Attempt to be ABI-compatible with code generated by Clang 17.0.x.
+/// This causes clang to revert some fixes to its implementation of the Itanium +/// name mangling scheme, with the consequence that overloaded function +/// templates are mangled the same if they differ only by: +/// - constraints +/// - whether a non-type template parameter has a deduced type +/// - the parameter list of a template template parameter +ABI_VER_MAJOR(17) + +/// Attempt to be ABI-compatible with code generated by Clang 18.0.x. +/// This causes clang to revert some fixes to the mangling of lambdas in the +/// initializers of members of local classes. +ABI_VER_MAJOR(18) + +/// Attempt to be ABI-compatible with code generated by Clang 19.0.x. +/// This causes clang to: +/// - Incorrectly mangle the 'base type' substitutions of the CXX construction +/// vtable because it hasn't added 'type' as a substitution. +/// - Skip mangling enclosing class templates of member-like friend function +/// templates. +/// - Ignore empty struct arguments in C++ mode for ARM, instead of passing +/// them as if they had a size of 1 byte. +ABI_VER_MAJOR(19) + +/// Attempt to be ABI-compatible with code generated by Clang 20.0.x. +/// This causes clang to: +/// - Incorrectly return C++ records in AVX registers on x86_64. +ABI_VER_MAJOR(20) + +/// Conform to the underlying platform's C and C++ ABIs as closely as we can. +ABI_VER_LATEST(Latest) + +#undef ABI_VER_MAJOR_MINOR +#undef ABI_VER_MAJOR +#undef ABI_VER_LATEST diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index cf23594..116341f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13234,9 +13234,9 @@ def err_wasm_builtin_arg_must_match_table_element_type : Error < "%ordinal0 argument must match the element type of the WebAssembly table in the %ordinal1 argument">; def err_wasm_builtin_arg_must_be_integer_type : Error < "%ordinal0 argument must be an integer">; -def err_wasm_builtin_test_fp_sig_cannot_include_reference_type - : Error<"not supported for " - "function pointers with a reference type %select{return " +def err_wasm_builtin_test_fp_sig_cannot_include_struct_or_union + : Error<"not supported with the multivalue ABI for " + "function pointers with a struct/union as %select{return " "value|parameter}0">; // OpenACC diagnostics. diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 0407897..569584b 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -186,95 +186,10 @@ public: /// Clang versions with different platform ABI conformance. enum class ClangABI { - /// Attempt to be ABI-compatible with code generated by Clang 3.8.x - /// (SVN r257626). This causes <1 x long long> to be passed in an - /// integer register instead of an SSE register on x64_64. - Ver3_8, - - /// Attempt to be ABI-compatible with code generated by Clang 4.0.x - /// (SVN r291814). This causes move operations to be ignored when - /// determining whether a class type can be passed or returned directly. - Ver4, - - /// Attempt to be ABI-compatible with code generated by Clang 6.0.x - /// (SVN r321711). This causes determination of whether a type is - /// standard-layout to ignore collisions between empty base classes - /// and between base classes and member subobjects, which affects - /// whether we reuse base class tail padding in some ABIs. 
- Ver6, - - /// Attempt to be ABI-compatible with code generated by Clang 7.0.x - /// (SVN r338536). This causes alignof (C++) and _Alignof (C11) to be - /// compatible with __alignof (i.e., return the preferred alignment) - /// rather than returning the required alignment. - Ver7, - - /// Attempt to be ABI-compatible with code generated by Clang 9.0.x - /// (SVN r351319). This causes vectors of __int128 to be passed in memory - /// instead of passing in multiple scalar registers on x86_64 on Linux and - /// NetBSD. - Ver9, - - /// Attempt to be ABI-compatible with code generated by Clang 11.0.x - /// (git 2e10b7a39b93). This causes clang to pass unions with a 256-bit - /// vector member on the stack instead of using registers, to not properly - /// mangle substitutions for template names in some cases, and to mangle - /// declaration template arguments without a cast to the parameter type - /// even when that can lead to mangling collisions. - Ver11, - - /// Attempt to be ABI-compatible with code generated by Clang 12.0.x - /// (git 8e464dd76bef). This causes clang to mangle lambdas within - /// global-scope inline variables incorrectly. - Ver12, - - /// Attempt to be ABI-compatible with code generated by Clang 14.0.x. - /// This causes clang to: - /// - mangle dependent nested names incorrectly. - /// - make trivial only those defaulted copy constructors with a - /// parameter-type-list equivalent to the parameter-type-list of an - /// implicit declaration. - Ver14, - - /// Attempt to be ABI-compatible with code generated by Clang 15.0.x. - /// This causes clang to: - /// - Reverse the implementation for DR692, DR1395 and DR1432. - /// - pack non-POD members of packed structs. - /// - consider classes with defaulted special member functions non-pod. - Ver15, - - /// Attempt to be ABI-compatible with code generated by Clang 17.0.x. - /// This causes clang to revert some fixes to its implementation of the - /// Itanium name mangling scheme, with the consequence that overloaded - /// function templates are mangled the same if they differ only by: - /// - constraints - /// - whether a non-type template parameter has a deduced type - /// - the parameter list of a template template parameter - Ver17, - - /// Attempt to be ABI-compatible with code generated by Clang 18.0.x. - /// This causes clang to revert some fixes to the mangling of lambdas - /// in the initializers of members of local classes. - Ver18, - - /// Attempt to be ABI-compatible with code generated by Clang 19.0.x. - /// This causes clang to: - /// - Incorrectly mangle the 'base type' substitutions of the CXX - /// construction vtable because it hasn't added 'type' as a substitution. - /// - Skip mangling enclosing class templates of member-like friend - /// function templates. - /// - Ignore empty struct arguments in C++ mode for ARM, instead of - /// passing them as if they had a size of 1 byte. - Ver19, - - /// Attempt to be ABI-compatible with code generated by Clang 20.0.x. - /// This causes clang to: - /// - Incorrectly return C++ records in AVX registers on x86_64. - Ver20, - - /// Conform to the underlying platform's C and C++ ABIs as closely - /// as we can. 
- Latest +#define ABI_VER_MAJOR_MINOR(Major, Minor) Ver##Major##_##Minor, +#define ABI_VER_MAJOR(Major) Ver##Major, +#define ABI_VER_LATEST(Latest) Latest +#include "clang/Basic/ABIVersions.def" }; enum class CoreFoundationABI { diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td index 588fb0d..3d34d77 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td @@ -516,4 +516,36 @@ def CIR_BitfieldInfoAttr : CIR_Attr<"BitfieldInfo", "bitfield_info"> { ]; } +//===----------------------------------------------------------------------===// +// AddressPointAttr +//===----------------------------------------------------------------------===// + +def CIR_AddressPointAttr : CIR_Attr<"AddressPoint", "address_point"> { + let summary = "Address point attribute"; + + let description = [{ + Attribute specifying the address point within a C++ virtual table (vtable). + + The `index` (vtable index) parameter identifies which vtable to use within a + vtable group, while the `offset` (address point index) specifies the offset + within that vtable where the address begins. + + Example: + ```mlir + cir.global linkonce_odr @_ZTV1B = ... + ... + %3 = cir.vtable.address_point(@_ZTV1B, + address_point = <index = 0, offset = 2>) + : !cir.vptr + ``` + }]; + + let parameters = (ins "int32_t":$index, + "int32_t":$offset); + + let assemblyFormat = [{ + `<` struct($index, $offset) `>` + }]; +} + #endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 3fdbf65..fdba4e4 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -40,6 +40,7 @@ def CIR_Dialect : Dialect { static llvm::StringRef getCalleeAttrName() { return "callee"; } static llvm::StringRef getNoThrowAttrName() { return "nothrow"; } static llvm::StringRef getSideEffectAttrName() { return "side_effect"; } + static llvm::StringRef getModuleLevelAsmAttrName() { return "cir.module_asm"; } void registerAttributes(); void registerTypes(); diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 72841a1..32813c1 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1692,6 +1692,49 @@ def CIR_GetGlobalOp : CIR_Op<"get_global", [ } //===----------------------------------------------------------------------===// +// VTableAddrPointOp +//===----------------------------------------------------------------------===// + +def CIR_VTableAddrPointOp : CIR_Op<"vtable.address_point", [ + Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface> +]> { + let summary = "Get the vtable (global variable) address point"; + let description = [{ + The `vtable.address_point` operation retrieves the "effective" address + (address point) of a C++ virtual table. An object's internal `__vptr` + gets initialized on top of the value returned by this operation. + + `address_point.index` (vtable index) provides the appropriate vtable within + the vtable group (as specified by the Itanium ABI), and `address_point.offset` + (address point index) provides the actual address point within that vtable. + + The return type is always `!cir.vptr`. + + Example: + ```mlir + cir.global linkonce_odr @_ZTV1B = ... + ...
+ %3 = cir.vtable.address_point(@_ZTV1B, + address_point = <index = 0, offset = 2>) : !cir.vptr + ``` + }]; + + let arguments = (ins + FlatSymbolRefAttr:$name, + CIR_AddressPointAttr:$address_point + ); + + let results = (outs Res<CIR_VPtrType, "", []>:$addr); + + let assemblyFormat = [{ + `(` + $name `,` `address_point` `=` $address_point + `)` + `:` qualified(type($addr)) attr-dict + }]; +} + +//===----------------------------------------------------------------------===// // SetBitfieldOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 27dd181..fcc8ce7 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -199,6 +199,7 @@ struct MissingFeatures { static bool dataLayoutTypeAllocSize() { return false; } static bool deferredCXXGlobalInit() { return false; } static bool ehCleanupFlags() { return false; } + static bool ehCleanupScope() { return false; } static bool ehstackBranches() { return false; } static bool emitCheckedInBoundsGEP() { return false; } static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } @@ -253,6 +254,7 @@ struct MissingFeatures { static bool thunks() { return false; } static bool tryEmitAsConstant() { return false; } static bool typeChecks() { return false; } + static bool vtableInitializer() { return false; } static bool weakRefReference() { return false; } static bool writebacks() { return false; } static bool appleKext() { return false; } diff --git a/clang/include/clang/Sema/SemaWasm.h b/clang/include/clang/Sema/SemaWasm.h index 8c0639f..f825907 100644 --- a/clang/include/clang/Sema/SemaWasm.h +++ b/clang/include/clang/Sema/SemaWasm.h @@ -37,7 +37,8 @@ public: bool BuiltinWasmTableGrow(CallExpr *TheCall); bool BuiltinWasmTableFill(CallExpr *TheCall); bool BuiltinWasmTableCopy(CallExpr *TheCall); - bool BuiltinWasmTestFunctionPointerSignature(CallExpr *TheCall); + bool BuiltinWasmTestFunctionPointerSignature(const TargetInfo &TI, + CallExpr *TheCall); WebAssemblyImportNameAttr * mergeImportNameAttr(Decl *D, const WebAssemblyImportNameAttr &AL); diff --git a/clang/lib/AST/ByteCode/DynamicAllocator.cpp b/clang/lib/AST/ByteCode/DynamicAllocator.cpp index 9b8b664..bbef941 100644 --- a/clang/lib/AST/ByteCode/DynamicAllocator.cpp +++ b/clang/lib/AST/ByteCode/DynamicAllocator.cpp @@ -128,7 +128,7 @@ bool DynamicAllocator::deallocate(const Expr *Source, return false; auto &Site = It->second; - assert(Site.size() > 0); + assert(!Site.empty()); // Find the Block to delete. auto AllocIt = llvm::find_if(Site.Allocations, [&](const Allocation &A) { @@ -144,7 +144,7 @@ bool DynamicAllocator::deallocate(const Expr *Source, S.deallocate(B); Site.Allocations.erase(AllocIt); - if (Site.size() == 0) + if (Site.empty()) AllocationSites.erase(It); return true; diff --git a/clang/lib/AST/ByteCode/DynamicAllocator.h b/clang/lib/AST/ByteCode/DynamicAllocator.h index cff09bf..cba5e34 100644 --- a/clang/lib/AST/ByteCode/DynamicAllocator.h +++ b/clang/lib/AST/ByteCode/DynamicAllocator.h @@ -55,6 +55,7 @@ private: } size_t size() const { return Allocations.size(); } + bool empty() const { return Allocations.empty(); } }; public: @@ -65,8 +66,6 @@ public: void cleanup(); - unsigned getNumAllocations() const { return AllocationSites.size(); } - /// Allocate ONE element of the given descriptor. 
Block *allocate(const Descriptor *D, unsigned EvalID, Form AllocForm); /// Allocate \p NumElements primitive elements of the given type. @@ -96,6 +95,8 @@ public: return llvm::make_range(AllocationSites.begin(), AllocationSites.end()); } + bool hasAllocations() const { return !AllocationSites.empty(); } + private: llvm::DenseMap<const Expr *, AllocationSite> AllocationSites; diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 976b7c0..9ed61c7 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -292,7 +292,7 @@ bool EvalEmitter::emitGetLocal(uint32_t I, const SourceInfo &Info) { Block *B = getLocal(I); - if (!CheckLocalLoad(S, OpPC, Pointer(B))) + if (!CheckLocalLoad(S, OpPC, B)) return false; S.Stk.push<T>(*reinterpret_cast<T *>(B->data())); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index eb4e480..bc14bd3d 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -211,25 +211,26 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC, S.Note(VD->getLocation(), diag::note_declared_at); } -static bool CheckTemporary(InterpState &S, CodePtr OpPC, const Pointer &Ptr, +static bool CheckTemporary(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK) { - if (auto ID = Ptr.getDeclID()) { - if (!Ptr.isStaticTemporary()) + if (B->getDeclID()) { + if (!(B->isStatic() && B->isTemporary())) return true; const auto *MTE = dyn_cast_if_present<MaterializeTemporaryExpr>( - Ptr.getDeclDesc()->asExpr()); + B->getDescriptor()->asExpr()); if (!MTE) return true; // FIXME(perf): Since we do this check on every Load from a static // temporary, it might make sense to cache the value of the // isUsableInConstantExpressions call. 
- if (!MTE->isUsableInConstantExpressions(S.getASTContext()) && - Ptr.block()->getEvalID() != S.Ctx.getEvalID()) { + if (B->getEvalID() != S.Ctx.getEvalID() && + !MTE->isUsableInConstantExpressions(S.getASTContext())) { const SourceInfo &E = S.Current->getSource(OpPC); S.FFDiag(E, diag::note_constexpr_access_static_temporary, 1) << AK; - S.Note(Ptr.getDeclLoc(), diag::note_constexpr_temporary_here); + S.Note(B->getDescriptor()->getLocation(), + diag::note_constexpr_temporary_here); return false; } } @@ -658,17 +659,19 @@ static bool CheckVolatile(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK) { +bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK) { assert(Ptr.isLive()); + assert(!Ptr.isInitialized()); + return DiagnoseUninitialized(S, OpPC, Ptr.isExtern(), Ptr.getDeclDesc(), AK); +} - if (Ptr.isInitialized()) - return true; - - if (Ptr.isExtern() && S.checkingPotentialConstantExpression()) +bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, bool Extern, + const Descriptor *Desc, AccessKinds AK) { + if (Extern && S.checkingPotentialConstantExpression()) return false; - if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); + if (const auto *VD = Desc->asVarDecl(); VD && (VD->isConstexpr() || VD->hasGlobalStorage())) { if (VD == S.EvaluatingDecl && @@ -703,9 +706,9 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -static bool CheckLifetime(InterpState &S, CodePtr OpPC, const Pointer &Ptr, +static bool CheckLifetime(InterpState &S, CodePtr OpPC, Lifetime LT, AccessKinds AK) { - if (Ptr.getLifetime() == Lifetime::Started) + if (LT == Lifetime::Started) return true; if (!S.checkingPotentialConstantExpression()) { @@ -715,11 +718,11 @@ static bool CheckLifetime(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -static bool CheckWeak(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { - if (!Ptr.isWeak()) +static bool CheckWeak(InterpState &S, CodePtr OpPC, const Block *B) { + if (!B->isWeak()) return true; - const auto *VD = Ptr.getDeclDesc()->asVarDecl(); + const auto *VD = B->getDescriptor()->asVarDecl(); assert(VD); S.FFDiag(S.Current->getLocation(OpPC), diag::note_constexpr_var_init_weak) << VD; @@ -732,32 +735,56 @@ static bool CheckWeak(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // ones removed that are impossible on primitive global values. // For example, since those can't be members of structs, they also can't // be mutable. 
-bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { - if (!CheckExtern(S, OpPC, Ptr)) - return false; - if (!CheckConstant(S, OpPC, Ptr)) +bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Block *B) { + const auto &Desc = + *reinterpret_cast<const GlobalInlineDescriptor *>(B->rawData()); + if (!CheckExtern(S, OpPC, Pointer(const_cast<Block *>(B)))) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Read)) + if (!CheckConstant(S, OpPC, B->getDescriptor())) return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) + if (!CheckDummy(S, OpPC, B, AK_Read)) return false; - if (!CheckTemporary(S, OpPC, Ptr, AK_Read)) + if (Desc.InitState != GlobalInitState::Initialized) + return DiagnoseUninitialized(S, OpPC, B->isExtern(), B->getDescriptor(), + AK_Read); + if (!CheckTemporary(S, OpPC, B, AK_Read)) return false; - if (!CheckWeak(S, OpPC, Ptr)) + if (!CheckWeak(S, OpPC, B)) return false; - if (!CheckVolatile(S, OpPC, Ptr, AK_Read)) + if (B->getDescriptor()->IsVolatile) { + if (!S.getLangOpts().CPlusPlus) + return Invalid(S, OpPC); + + const ValueDecl *D = B->getDescriptor()->asValueDecl(); + S.FFDiag(S.Current->getLocation(OpPC), + diag::note_constexpr_access_volatile_obj, 1) + << AK_Read << 1 << D; + S.Note(D->getLocation(), diag::note_constexpr_volatile_here) << 1; return false; + } return true; } // Similarly, for local loads. -bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { - if (!CheckLifetime(S, OpPC, Ptr, AK_Read)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) - return false; - if (!CheckVolatile(S, OpPC, Ptr, AK_Read)) +bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Block *B) { + assert(!B->isExtern()); + const auto &Desc = *reinterpret_cast<const InlineDescriptor *>(B->rawData()); + if (!CheckLifetime(S, OpPC, Desc.LifeState, AK_Read)) + return false; + if (!Desc.IsInitialized) + return DiagnoseUninitialized(S, OpPC, /*Extern=*/false, B->getDescriptor(), + AK_Read); + if (B->getDescriptor()->IsVolatile) { + if (!S.getLangOpts().CPlusPlus) + return Invalid(S, OpPC); + + const ValueDecl *D = B->getDescriptor()->asValueDecl(); + S.FFDiag(S.Current->getLocation(OpPC), + diag::note_constexpr_access_volatile_obj, 1) + << AK_Read << 1 << D; + S.Note(D->getLocation(), diag::note_constexpr_volatile_here) << 1; return false; + } return true; } @@ -769,19 +796,19 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; if (!CheckConstant(S, OpPC, Ptr)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK)) return false; if (!CheckRange(S, OpPC, Ptr, AK)) return false; if (!CheckActive(S, OpPC, Ptr, AK)) return false; - if (!CheckLifetime(S, OpPC, Ptr, AK)) + if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK)) return false; - if (!CheckInitialized(S, OpPC, Ptr, AK)) + if (!Ptr.isInitialized()) + return DiagnoseUninitialized(S, OpPC, Ptr, AK); + if (Ptr.isBlockPointer() && !CheckTemporary(S, OpPC, Ptr.block(), AK)) return false; - if (!CheckTemporary(S, OpPC, Ptr, AK)) - return false; - if (!CheckWeak(S, OpPC, Ptr)) + if (Ptr.isBlockPointer() && !CheckWeak(S, OpPC, Ptr.block())) return false; if (!CheckMutable(S, OpPC, Ptr)) return false; @@ -798,7 +825,7 @@ bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckConstant(S, OpPC, Ptr)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Read)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Read)) return false; if (!CheckExtern(S, OpPC, 
Ptr)) return false; @@ -806,13 +833,13 @@ bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; if (!CheckActive(S, OpPC, Ptr, AK_Read)) return false; - if (!CheckLifetime(S, OpPC, Ptr, AK_Read)) - return false; - if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) + if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK_Read)) return false; - if (!CheckTemporary(S, OpPC, Ptr, AK_Read)) + if (!Ptr.isInitialized()) + return DiagnoseUninitialized(S, OpPC, Ptr, AK_Read); + if (Ptr.isBlockPointer() && !CheckTemporary(S, OpPC, Ptr.block(), AK_Read)) return false; - if (!CheckWeak(S, OpPC, Ptr)) + if (Ptr.isBlockPointer() && !CheckWeak(S, OpPC, Ptr.block())) return false; if (!CheckMutable(S, OpPC, Ptr)) return false; @@ -822,9 +849,9 @@ bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_Assign)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Assign)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Assign)) return false; - if (!CheckLifetime(S, OpPC, Ptr, AK_Assign)) + if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK_Assign)) return false; if (!CheckExtern(S, OpPC, Ptr)) return false; @@ -1098,12 +1125,11 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { return diagnoseUnknownDecl(S, OpPC, D); } -bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK) { - if (!Ptr.isDummy()) +bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK) { + const Descriptor *Desc = B->getDescriptor(); + if (!Desc->isDummy()) return true; - const Descriptor *Desc = Ptr.getDeclDesc(); const ValueDecl *D = Desc->asValueDecl(); if (!D) return false; @@ -1426,7 +1452,7 @@ static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func, bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_Destroy)) return false; - if (!CheckTemporary(S, OpPC, Ptr, AK_Destroy)) + if (!CheckTemporary(S, OpPC, Ptr.block(), AK_Destroy)) return false; if (!CheckRange(S, OpPC, Ptr, AK_Destroy)) return false; @@ -1620,8 +1646,17 @@ bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func, const auto *StaticDecl = cast<CXXRecordDecl>(Func->getParentDecl()); const auto *InitialFunction = cast<CXXMethodDecl>(Callee); - const CXXMethodDecl *Overrider = S.getContext().getOverridingFunction( - DynamicDecl, StaticDecl, InitialFunction); + const CXXMethodDecl *Overrider; + + if (StaticDecl != DynamicDecl) { + if (!DynamicDecl->isDerivedFrom(StaticDecl)) + return false; + Overrider = S.getContext().getOverridingFunction(DynamicDecl, StaticDecl, + InitialFunction); + + } else { + Overrider = InitialFunction; + } if (Overrider != InitialFunction) { // DR1872: An instantiated virtual constexpr function can't be called in a @@ -1749,7 +1784,7 @@ static void startLifetimeRecurse(const Pointer &Ptr) { bool StartLifetime(InterpState &S, CodePtr OpPC) { const auto &Ptr = S.Stk.peek<Pointer>(); - if (!CheckDummy(S, OpPC, Ptr, AK_Destroy)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy)) return false; startLifetimeRecurse(Ptr.narrow()); return true; @@ -1780,7 +1815,7 @@ static void endLifetimeRecurse(const Pointer &Ptr) { /// Ends the lifetime of the peek'd pointer. 
bool EndLifetime(InterpState &S, CodePtr OpPC) { const auto &Ptr = S.Stk.peek<Pointer>(); - if (!CheckDummy(S, OpPC, Ptr, AK_Destroy)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy)) return false; endLifetimeRecurse(Ptr.narrow()); return true; @@ -1789,7 +1824,7 @@ bool EndLifetime(InterpState &S, CodePtr OpPC) { /// Ends the lifetime of the pop'd pointer. bool EndLifetimePop(InterpState &S, CodePtr OpPC) { const auto &Ptr = S.Stk.pop<Pointer>(); - if (!CheckDummy(S, OpPC, Ptr, AK_Destroy)) + if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy)) return false; endLifetimeRecurse(Ptr.narrow()); return true; @@ -1804,16 +1839,16 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, // Similar to CheckStore(), but with the additional CheckTemporary() call and // the AccessKinds are different. - if (!CheckTemporary(S, OpPC, Ptr, AK_Construct)) + if (!CheckTemporary(S, OpPC, Ptr.block(), AK_Construct)) return false; if (!CheckLive(S, OpPC, Ptr, AK_Construct)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Construct)) + if (!CheckDummy(S, OpPC, Ptr.block(), AK_Construct)) return false; // CheckLifetime for this and all base pointers. for (Pointer P = Ptr;;) { - if (!CheckLifetime(S, OpPC, P, AK_Construct)) + if (!CheckLifetime(S, OpPC, P.getLifetime(), AK_Construct)) return false; if (P.isRoot()) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 8a28106..0d3f492 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -51,8 +51,7 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK); /// Checks if a pointer is a dummy pointer. -bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK); +bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK); /// Checks if a pointer is null. bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, @@ -89,11 +88,14 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK = AK_Read); bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK); +bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK); +bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, bool Extern, + const Descriptor *Desc, AccessKinds AK); + /// Checks a direct load of a primitive value from a global or local variable. -bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); +bool CheckGlobalLoad(InterpState &S, CodePtr OpPC, const Block *B); +bool CheckLocalLoad(InterpState &S, CodePtr OpPC, const Block *B); /// Checks if a value can be stored in a block. 
bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr); @@ -1351,10 +1353,10 @@ inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) { template <PrimType Name, class T = typename PrimConv<Name>::T> bool GetLocal(InterpState &S, CodePtr OpPC, uint32_t I) { - const Pointer &Ptr = S.Current->getLocalPointer(I); - if (!CheckLocalLoad(S, OpPC, Ptr)) + const Block *B = S.Current->getLocalBlock(I); + if (!CheckLocalLoad(S, OpPC, B)) return false; - S.Stk.push<T>(Ptr.deref<T>()); + S.Stk.push<T>(B->deref<T>()); return true; } @@ -1465,22 +1467,26 @@ bool SetThisField(InterpState &S, CodePtr OpPC, uint32_t I) { template <PrimType Name, class T = typename PrimConv<Name>::T> bool GetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) { - const Pointer &Ptr = S.P.getPtrGlobal(I); + const Block *B = S.P.getGlobal(I); - if (!CheckGlobalLoad(S, OpPC, Ptr)) + if (!CheckGlobalLoad(S, OpPC, B)) return false; - S.Stk.push<T>(Ptr.deref<T>()); + S.Stk.push<T>(B->deref<T>()); return true; } /// Same as GetGlobal, but without the checks. template <PrimType Name, class T = typename PrimConv<Name>::T> bool GetGlobalUnchecked(InterpState &S, CodePtr OpPC, uint32_t I) { - const Pointer &Ptr = S.P.getPtrGlobal(I); - if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) - return false; - S.Stk.push<T>(Ptr.deref<T>()); + const Block *B = S.P.getGlobal(I); + const auto &Desc = + *reinterpret_cast<const GlobalInlineDescriptor *>(B->rawData()); + if (Desc.InitState != GlobalInitState::Initialized) + return DiagnoseUninitialized(S, OpPC, B->isExtern(), B->getDescriptor(), + AK_Read); + + S.Stk.push<T>(B->deref<T>()); return true; } @@ -2351,8 +2357,8 @@ static inline bool IncDecPtrHelper(InterpState &S, CodePtr OpPC, static inline bool IncPtr(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop<Pointer>(); - if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) - return false; + if (!Ptr.isInitialized()) + return DiagnoseUninitialized(S, OpPC, Ptr, AK_Increment); return IncDecPtrHelper<ArithOp::Add>(S, OpPC, Ptr); } @@ -2360,8 +2366,8 @@ static inline bool IncPtr(InterpState &S, CodePtr OpPC) { static inline bool DecPtr(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop<Pointer>(); - if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) - return false; + if (!Ptr.isInitialized()) + return DiagnoseUninitialized(S, OpPC, Ptr, AK_Decrement); return IncDecPtrHelper<ArithOp::Sub>(S, OpPC, Ptr); } @@ -3195,6 +3201,9 @@ inline bool GetMemberPtr(InterpState &S, CodePtr OpPC, const ValueDecl *D) { inline bool GetMemberPtrBase(InterpState &S, CodePtr OpPC) { const auto &MP = S.Stk.pop<MemberPointer>(); + if (!MP.isBaseCastPossible()) + return false; + S.Stk.push<Pointer>(MP.getBase()); return true; } diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 5162223..07194d6 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -103,6 +103,10 @@ public: return reinterpret_cast<const std::byte *>(this) + sizeof(Block); } + template <typename T> T deref() const { + return *reinterpret_cast<const T *>(data()); + } + /// Invokes the constructor. 
void invokeCtor() { assert(!IsInitialized); diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index f908d02..c835bd4 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -276,7 +276,7 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC, if (!CheckLive(S, OpPC, StrPtr, AK_Read)) return false; - if (!CheckDummy(S, OpPC, StrPtr, AK_Read)) + if (!CheckDummy(S, OpPC, StrPtr.block(), AK_Read)) return false; assert(StrPtr.getFieldDesc()->isPrimitiveArray()); @@ -2232,7 +2232,7 @@ static bool interp__builtin_is_within_lifetime(InterpState &S, CodePtr OpPC, return false; if (!CheckMutable(S, OpPC, Ptr)) return false; - if (!CheckDummy(S, OpPC, Ptr, AK_Read)) + if (!CheckDummy(S, OpPC, Ptr.block(), AK_Read)) return false; } diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 9342192..f2eac86 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -231,6 +231,10 @@ Pointer InterpFrame::getLocalPointer(unsigned Offset) const { return Pointer(localBlock(Offset)); } +Block *InterpFrame::getLocalBlock(unsigned Offset) const { + return localBlock(Offset); +} + Pointer InterpFrame::getParamPointer(unsigned Off) { // Return the block if it was created previously. if (auto Pt = Params.find(Off); Pt != Params.end()) diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h index cfebe93..4be5391 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.h +++ b/clang/lib/AST/ByteCode/InterpFrame.h @@ -86,6 +86,7 @@ public: /// Returns a pointer to a local variables. Pointer getLocalPointer(unsigned Offset) const; + Block *getLocalBlock(unsigned Offset) const; /// Returns the value of an argument. template <typename T> const T &getParam(unsigned Offset) const { diff --git a/clang/lib/AST/ByteCode/InterpState.cpp b/clang/lib/AST/ByteCode/InterpState.cpp index a06b125..49c9b54 100644 --- a/clang/lib/AST/ByteCode/InterpState.cpp +++ b/clang/lib/AST/ByteCode/InterpState.cpp @@ -76,9 +76,6 @@ bool InterpState::reportOverflow(const Expr *E, const llvm::APSInt &Value) { void InterpState::deallocate(Block *B) { assert(B); - const Descriptor *Desc = B->getDescriptor(); - assert(Desc); - // The block might have a pointer saved in a field in its data // that points to the block itself. We call the dtor first, // which will destroy all the data but leave InlineDescriptors @@ -95,7 +92,7 @@ void InterpState::deallocate(Block *B) { auto *D = new (Memory) DeadBlock(DeadBlocks, B); // Since the block doesn't hold any actual data anymore, we can just // memcpy() everything over. - std::memcpy(D->rawData(), B->rawData(), Desc->getAllocSize()); + std::memcpy(D->rawData(), B->rawData(), B->getSize()); D->B.IsInitialized = B->IsInitialized; // We moved the contents over to the DeadBlock. 
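[Editor's note: taken together, the `Block::deref<T>()` helper added in the InterpBlock.h hunk above and the new `getLocalBlock()`/`CheckLocalLoad(..., const Block *)` overloads let the interpreter's load fast paths read a primitive straight out of a block's payload without first materializing a `Pointer`. A reduced stand-alone sketch of that access pattern, using a toy `Block` rather than the real `clang::interp::Block`:]

```c++
#include <cassert>
#include <cstdint>
#include <cstring>

// Toy stand-in for clang::interp::Block: in the real class, metadata and an
// inline descriptor precede the payload returned by data().
struct Block {
  alignas(8) unsigned char payload[16] = {};
  const unsigned char *data() const { return payload; }
  // Mirrors the Block::deref<T>() added in InterpBlock.h, which does
  // *reinterpret_cast<const T *>(data()); memcpy is the strictly
  // aliasing-safe spelling of the same read.
  template <typename T> T deref() const {
    T value;
    std::memcpy(&value, data(), sizeof(T));
    return value;
  }
};

int main() {
  Block b;
  std::uint64_t v = 1234;
  std::memcpy(b.payload, &v, sizeof(v)); // simulate an initialized local
  assert(b.deref<std::uint64_t>() == 1234);
  return 0;
}
```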
@@ -104,15 +101,14 @@ void InterpState::deallocate(Block *B) { } bool InterpState::maybeDiagnoseDanglingAllocations() { - bool NoAllocationsLeft = (Alloc.getNumAllocations() == 0); + bool NoAllocationsLeft = !Alloc.hasAllocations(); if (!checkingPotentialConstantExpression()) { - for (const auto &It : Alloc.allocation_sites()) { - assert(It.second.size() > 0); + for (const auto &[Source, Site] : Alloc.allocation_sites()) { + assert(!Site.empty()); - const Expr *Source = It.first; CCEDiag(Source->getExprLoc(), diag::note_constexpr_memory_leak) - << (It.second.size() - 1) << Source->getSourceRange(); + << (Site.size() - 1) << Source->getSourceRange(); } } // Keep evaluating before C++20, since the CXXNewExpr wasn't valid there diff --git a/clang/lib/AST/ByteCode/MemberPointer.h b/clang/lib/AST/ByteCode/MemberPointer.h index b17ce25..8dd75ca 100644 --- a/clang/lib/AST/ByteCode/MemberPointer.h +++ b/clang/lib/AST/ByteCode/MemberPointer.h @@ -51,6 +51,12 @@ public: FunctionPointer toFunctionPointer(const Context &Ctx) const; + bool isBaseCastPossible() const { + if (PtrOffset < 0) + return true; + return static_cast<uint64_t>(PtrOffset) <= Base.getByteOffset(); + } + Pointer getBase() const { if (PtrOffset < 0) return Base.atField(-PtrOffset); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 141edc8..03d7413 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -5246,6 +5246,15 @@ bool Type::isHLSLResourceRecord() const { return HLSLAttributedResourceType::findHandleTypeOnResource(this) != nullptr; } +bool Type::isHLSLResourceRecordArray() const { + const Type *Ty = getUnqualifiedDesugaredType(); + if (!Ty->isArrayType()) + return false; + while (isa<ConstantArrayType>(Ty)) + Ty = Ty->getArrayElementTypeNoTypeQual(); + return Ty->isHLSLResourceRecord(); +} + bool Type::isHLSLIntangibleType() const { const Type *Ty = getUnqualifiedDesugaredType(); diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index be21ce9..b8663eb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -16,6 +16,7 @@ // //===----------------------------------------------------------------------===// +#include "CIRGenCleanup.h" #include "CIRGenFunction.h" #include "clang/CIR/MissingFeatures.h" @@ -33,6 +34,52 @@ using namespace clang::CIRGen; void EHScopeStack::Cleanup::anchor() {} +/// Push an entry of the given size onto this protected-scope stack. +char *EHScopeStack::allocate(size_t size) { + size = llvm::alignTo(size, ScopeStackAlignment); + if (!startOfBuffer) { + unsigned capacity = llvm::PowerOf2Ceil(std::max(size, 1024ul)); + startOfBuffer = std::make_unique<char[]>(capacity); + startOfData = endOfBuffer = startOfBuffer.get() + capacity; + } else if (static_cast<size_t>(startOfData - startOfBuffer.get()) < size) { + unsigned currentCapacity = endOfBuffer - startOfBuffer.get(); + unsigned usedCapacity = + currentCapacity - (startOfData - startOfBuffer.get()); + unsigned requiredCapacity = usedCapacity + size; + // We know from the 'else if' condition that requiredCapacity is greater + // than currentCapacity. 
+ unsigned newCapacity = llvm::PowerOf2Ceil(requiredCapacity); + + std::unique_ptr<char[]> newStartOfBuffer = + std::make_unique<char[]>(newCapacity); + char *newEndOfBuffer = newStartOfBuffer.get() + newCapacity; + char *newStartOfData = newEndOfBuffer - usedCapacity; + memcpy(newStartOfData, startOfData, usedCapacity); + startOfBuffer.swap(newStartOfBuffer); + endOfBuffer = newEndOfBuffer; + startOfData = newStartOfData; + } + + assert(startOfBuffer.get() + size <= startOfData); + startOfData -= size; + return startOfData; +} + +void EHScopeStack::deallocate(size_t size) { + startOfData += llvm::alignTo(size, ScopeStackAlignment); +} + +void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { + char *buffer = allocate(size); + + // When the full implementation is upstreamed, this will allocate + // extra memory for and construct a wrapper object that is used to + // manage the cleanup generation. + assert(!cir::MissingFeatures::ehCleanupScope()); + + return buffer; +} + static mlir::Block *getCurCleanupBlock(CIRGenFunction &cgf) { mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder()); mlir::Block *cleanup = @@ -44,26 +91,34 @@ static mlir::Block *getCurCleanupBlock(CIRGenFunction &cgf) { /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. void CIRGenFunction::popCleanupBlock() { - assert(!ehStack.cleanupStack.empty() && "cleanup stack is empty!"); + assert(!ehStack.empty() && "cleanup stack is empty!"); + + // The memory for the cleanup continues to be owned by the EHScopeStack + // allocator, so we just destroy the object rather than attempting to + // free it. + EHScopeStack::Cleanup &cleanup = *ehStack.begin(); + + // The eventual implementation here will use the EHCleanupScope helper class. + assert(!cir::MissingFeatures::ehCleanupScope()); + mlir::OpBuilder::InsertionGuard guard(builder); - std::unique_ptr<EHScopeStack::Cleanup> cleanup = - ehStack.cleanupStack.pop_back_val(); assert(!cir::MissingFeatures::ehCleanupFlags()); mlir::Block *cleanupEntry = getCurCleanupBlock(*this); builder.setInsertionPointToEnd(cleanupEntry); - cleanup->emit(*this); + cleanup.emit(*this); + + ehStack.deallocate(cleanup.getSize()); } /// Pops cleanup blocks until the given savepoint is reached. -void CIRGenFunction::popCleanupBlocks(size_t oldCleanupStackDepth) { +void CIRGenFunction::popCleanupBlocks( + EHScopeStack::stable_iterator oldCleanupStackDepth) { assert(!cir::MissingFeatures::ehstackBranches()); - assert(ehStack.getStackDepth() >= oldCleanupStackDepth); - // Pop cleanup blocks until we reach the base stack depth for the // current scope. - while (ehStack.getStackDepth() > oldCleanupStackDepth) { + while (ehStack.stable_begin() != oldCleanupStackDepth) { popCleanupBlock(); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h new file mode 100644 index 0000000..7361c8c --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These classes support the generation of CIR for cleanups, initially based +// on LLVM IR cleanup handling, but ought to change as CIR evolves. 
+// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H +#define CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H + +#include "EHScopeStack.h" + +namespace clang::CIRGen { + +/// A non-stable pointer into the scope stack. +class EHScopeStack::iterator { + char *ptr = nullptr; + + friend class EHScopeStack; + explicit iterator(char *ptr) : ptr(ptr) {} + +public: + iterator() = default; + + EHScopeStack::Cleanup *get() const { + return reinterpret_cast<EHScopeStack::Cleanup *>(ptr); + } + + EHScopeStack::Cleanup &operator*() const { return *get(); } +}; + +inline EHScopeStack::iterator EHScopeStack::begin() const { + return iterator(startOfData); +} + +} // namespace clang::CIRGen +#endif // CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 78d375c..715d101 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -667,6 +667,12 @@ struct DestroyObject final : EHScopeStack::Cleanup { void emit(CIRGenFunction &cgf) override { cgf.emitDestroy(addr, type, destroyer); } + + // This is a placeholder until EHCleanupScope is implemented. + size_t getSize() const override { + assert(!cir::MissingFeatures::ehCleanupScope()); + return sizeof(DestroyObject); + } }; } // namespace diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index c22cf60..cba06a1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -171,6 +171,10 @@ public: mlir::Value VisitBinSubAssign(const CompoundAssignOperator *e) { return emitCompoundAssign(e, &ComplexExprEmitter::emitBinSub); } + + mlir::Value VisitBinMulAssign(const CompoundAssignOperator *e) { + return emitCompoundAssign(e, &ComplexExprEmitter::emitBinMul); + } }; } // namespace @@ -776,7 +780,7 @@ getComplexRangeAttr(LangOptions::ComplexRangeKind range) { case LangOptions::CX_Basic: return cir::ComplexRangeKind::Basic; case LangOptions::CX_None: - // The default value for ComplexRangeKind is Full is no option is selected + // The default value for ComplexRangeKind is Full if no option is selected return cir::ComplexRangeKind::Full; } } @@ -813,7 +817,7 @@ using CompoundFunc = static CompoundFunc getComplexOp(BinaryOperatorKind op) { switch (op) { case BO_MulAssign: - llvm_unreachable("getComplexOp: BO_MulAssign"); + return &ComplexExprEmitter::emitBinMul; case BO_DivAssign: llvm_unreachable("getComplexOp: BO_DivAssign"); case BO_SubAssign: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index e93dc0b..dedd01c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -28,8 +28,6 @@ CIRGenFunction::CIRGenFunction(CIRGenModule &cgm, CIRGenBuilderTy &builder, bool suppressNewContext) : CIRGenTypeCache(cgm), cgm{cgm}, builder(builder) { ehStack.setCGF(this); - currentCleanupStackDepth = 0; - assert(ehStack.getStackDepth() == 0); } CIRGenFunction::~CIRGenFunction() {} @@ -409,6 +407,8 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, const auto *fd = dyn_cast_or_null<FunctionDecl>(d); curFuncDecl = d->getNonClosureContext(); + prologueCleanupDepth = ehStack.stable_begin(); + mlir::Block *entryBB = &fn.getBlocks().front(); builder.setInsertionPointToStart(entryBB); @@ -475,11 +475,11 @@ void CIRGenFunction::finishFunction(SourceLocation endLoc) { // important to do 
this before we enter the return block or return // edges will be *really* confused. // TODO(cir): Use prologueCleanupDepth here. - bool hasCleanups = ehStack.getStackDepth() != currentCleanupStackDepth; + bool hasCleanups = ehStack.stable_begin() != prologueCleanupDepth; if (hasCleanups) { assert(!cir::MissingFeatures::generateDebugInfo()); // FIXME(cir): should we clearInsertionPoint? breaks many testcases - popCleanupBlocks(currentCleanupStackDepth); + popCleanupBlocks(prologueCleanupDepth); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 2e60cfc..bdbc77c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -601,9 +601,13 @@ public: FunctionArgList args, clang::SourceLocation loc, clang::SourceLocation startLoc); + /// The cleanup depth enclosing all the cleanups associated with the + /// parameters. + EHScopeStack::stable_iterator prologueCleanupDepth; + /// Takes the old cleanup stack size and emits the cleanup blocks /// that have been added. - void popCleanupBlocks(size_t oldCleanupStackDepth); + void popCleanupBlocks(EHScopeStack::stable_iterator oldCleanupStackDepth); void popCleanupBlock(); /// Push a cleanup to be run at the end of the current full-expression. Safe @@ -622,7 +626,7 @@ public: /// Enters a new scope for capturing cleanups, all of which /// will be executed once the scope is exited. class RunCleanupsScope { - size_t cleanupStackDepth, oldCleanupStackDepth; + EHScopeStack::stable_iterator cleanupStackDepth, oldCleanupStackDepth; protected: bool performCleanup; @@ -638,7 +642,7 @@ public: /// Enter a new cleanup scope. explicit RunCleanupsScope(CIRGenFunction &cgf) : performCleanup(true), cgf(cgf) { - cleanupStackDepth = cgf.ehStack.getStackDepth(); + cleanupStackDepth = cgf.ehStack.stable_begin(); oldCleanupStackDepth = cgf.currentCleanupStackDepth; cgf.currentCleanupStackDepth = cleanupStackDepth; } @@ -663,7 +667,7 @@ public: }; // Cleanup stack depth of the RunCleanupsScope that was pushed most recently. - size_t currentCleanupStackDepth; + EHScopeStack::stable_iterator currentCleanupStackDepth = ehStack.stable_end(); public: /// Represents a scope, including function bodies, compound statements, and diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 425250d..ff6d293 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1365,6 +1365,21 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) { assert(!cir::MissingFeatures::generateDebugInfo()); assert(!cir::MissingFeatures::cxxRecordStaticMembers()); break; + + case Decl::FileScopeAsm: + // File-scope asm is ignored during device-side CUDA compilation. + if (langOpts.CUDA && langOpts.CUDAIsDevice) + break; + // File-scope asm is ignored during device-side OpenMP compilation. + if (langOpts.OpenMPIsTargetDevice) + break; + // File-scope asm is ignored during device-side SYCL compilation. + if (langOpts.SYCLIsDevice) + break; + auto *file_asm = cast<FileScopeAsmDecl>(decl); + std::string line = file_asm->getAsmString(); + globalScopeAsm.push_back(builder.getStringAttr(line)); + break; } } @@ -1978,6 +1993,9 @@ void CIRGenModule::release() { emitDeferred(); applyReplacements(); + theModule->setAttr(cir::CIRDialect::getModuleLevelAsmAttrName(), + builder.getArrayAttr(globalScopeAsm)); + // There's a lot of code that is not implemented yet. 
assert(!cir::MissingFeatures::cgmRelease()); } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 5d07d38..163a0fc 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -90,6 +90,8 @@ private: /// for FunctionDecls's. CIRGenFunction *curCGF = nullptr; + llvm::SmallVector<mlir::Attribute> globalScopeAsm; + public: mlir::ModuleOp getModule() const { return theModule; } CIRGenBuilderTy &getBuilder() { return builder; } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index 50642e7..332babd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -412,7 +412,7 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { auto *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); // This should emit a branch through the cleanup block if one exists. builder.create<cir::BrOp>(loc, retBlock); - if (ehStack.getStackDepth() != currentCleanupStackDepth) + if (ehStack.stable_begin() != currentCleanupStackDepth) cgm.errorNYI(s.getSourceRange(), "return with cleanup stack"); builder.createBlock(builder.getBlock()->getParent()); diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 22750ac..47478f6 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -42,7 +42,47 @@ enum CleanupKind : unsigned { /// A stack of scopes which respond to exceptions, including cleanups /// and catch blocks. class EHScopeStack { + friend class CIRGenFunction; + public: + // TODO(ogcg): Switch to alignof(uint64_t) instead of 8 + enum { ScopeStackAlignment = 8 }; + + /// A saved depth on the scope stack. This is necessary because + /// pushing scopes onto the stack invalidates iterators. + class stable_iterator { + friend class EHScopeStack; + + /// Offset from startOfData to endOfBuffer. + ptrdiff_t size = -1; + + explicit stable_iterator(ptrdiff_t size) : size(size) {} + + public: + static stable_iterator invalid() { return stable_iterator(-1); } + stable_iterator() = default; + + bool isValid() const { return size >= 0; } + + /// Returns true if this scope encloses `other`. + /// Returns false if `other` is invalid. + /// This scope must be valid. + bool encloses(stable_iterator other) const { return size <= other.size; } + + /// Returns true if this scope strictly encloses I: that is, + /// if it encloses I and is not I. + /// Returns false if I is invalid. + /// This scope must be valid. + bool strictlyEncloses(stable_iterator I) const { return size < I.size; } + + friend bool operator==(stable_iterator A, stable_iterator B) { + return A.size == B.size; + } + friend bool operator!=(stable_iterator A, stable_iterator B) { + return A.size != B.size; + } + }; + /// Information for lazily generating a cleanup. Subclasses must be /// POD-like: cleanups will not be destructed, and they will be /// allocated on the cleanup stack and freely copied and moved /// around. @@ -68,30 +108,75 @@ public: /// // \param flags cleanup kind. virtual void emit(CIRGenFunction &cgf) = 0; - }; - // Classic codegen has a finely tuned custom allocator and a complex stack - // management scheme. We'll probably eventually want to find a way to share - // that implementation. For now, we will use a very simplified implementation - // to get cleanups working. - llvm::SmallVector<std::unique_ptr<Cleanup>, 8> cleanupStack; + // This is a placeholder until EHScope is implemented.
+ virtual size_t getSize() const = 0; + }; private: + // The implementation for this class is in CIRGenCleanup.h and + // CIRGenCleanup.cpp; the definition is here because it's used as a + // member of CIRGenFunction. + + /// The start of the scope-stack buffer, i.e. the allocated pointer + /// for the buffer. All of these pointers are either simultaneously + /// null or simultaneously valid. + std::unique_ptr<char[]> startOfBuffer; + + /// The end of the buffer. + char *endOfBuffer = nullptr; + + /// The first valid entry in the buffer. + char *startOfData = nullptr; + /// The CGF this stack belongs to CIRGenFunction *cgf = nullptr; + // This class uses a custom allocator for maximum efficiency because cleanups + // are allocated and freed very frequently. It's basically a bump pointer + // allocator, but we can't use LLVM's BumpPtrAllocator because we use offsets + // into the buffer as stable iterators. + char *allocate(size_t size); + void deallocate(size_t size); + + void *pushCleanup(CleanupKind kind, size_t dataSize); + public: EHScopeStack() = default; ~EHScopeStack() = default; /// Push a lazily-created cleanup on the stack. template <class T, class... As> void pushCleanup(CleanupKind kind, As... a) { - cleanupStack.push_back(std::make_unique<T>(a...)); + static_assert(alignof(T) <= ScopeStackAlignment, + "Cleanup's alignment is too large."); + void *buffer = pushCleanup(kind, sizeof(T)); + [[maybe_unused]] Cleanup *obj = new (buffer) T(a...); } void setCGF(CIRGenFunction *inCGF) { cgf = inCGF; } - size_t getStackDepth() const { return cleanupStack.size(); } + /// Pops a cleanup scope off the stack. This is private to CIRGenCleanup.cpp. + void popCleanup(); + + /// Determines whether the exception-scopes stack is empty. + bool empty() const { return startOfData == endOfBuffer; } + + /// An unstable reference to a scope-stack depth. Invalidated by + /// pushes but not pops. + class iterator; + + /// Returns an iterator pointing to the innermost EH scope. + iterator begin() const; + + /// Create a stable reference to the top of the EH stack. The + /// returned reference is valid until that scope is popped off the + /// stack. + stable_iterator stable_begin() const { + return stable_iterator(endOfBuffer - startOfData); + } + + /// Create a stable reference to the bottom of the EH stack. + static stable_iterator stable_end() { return stable_iterator(0); } }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index d3fcac1..53ab04e 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1444,6 +1444,27 @@ cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) { } //===----------------------------------------------------------------------===// +// VTableAddrPointOp +//===----------------------------------------------------------------------===// + +LogicalResult +cir::VTableAddrPointOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + StringRef name = getName(); + + // Verify that the result type's underlying pointer type matches the type of + // the referenced cir.global or cir.func op.
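// The verification below proceeds in three steps: resolve the symbol to a cir.global (emitting an error for anything else), accept globals that do not yet have an initializer, and defer validating the initializer's vtable layout until vtable initializers are implemented (tracked by cir::MissingFeatures::vtableInitializer()).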
+ auto op = symbolTable.lookupNearestSymbolFrom<GlobalOp>(*this, getNameAttr()); + if (!op) + return emitOpError("'") + << name << "' does not reference a valid cir.global"; + std::optional<mlir::Attribute> init = op.getInitialValue(); + if (!init) + return success(); + assert(!cir::MissingFeatures::vtableInitializer()); + return success(); +} + +//===----------------------------------------------------------------------===// // FuncOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 7e1c9fb..43a1b51 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2143,6 +2143,11 @@ void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) { module->getAttr(cir::CIRDialect::getTripleAttrName())) module->setAttr(mlir::LLVM::LLVMDialect::getTargetTripleAttrName(), tripleAttr); + + if (mlir::Attribute asmAttr = + module->getAttr(cir::CIRDialect::getModuleLevelAsmAttrName())) + module->setAttr(mlir::LLVM::LLVMDialect::getModuleLevelAsmAttrName(), + asmAttr); } void ConvertCIRToLLVMPass::runOnOperation() { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index f64ac20..918cb3e 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -103,13 +103,6 @@ llvm::Triple::ArchType CGHLSLRuntime::getArch() { return CGM.getTarget().getTriple().getArch(); } -// Returns true if the type is an HLSL resource class or an array of them -static bool isResourceRecordTypeOrArrayOf(const clang::Type *Ty) { - while (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty)) - Ty = CAT->getArrayElementTypeNoTypeQual(); - return Ty->isHLSLResourceRecord(); -} - // Emits constant global variables for buffer constants declarations // and creates metadata linking the constant globals with the buffer global. 
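// The helper deleted above is subsumed by the new Type::isHLSLResourceRecordArray() declared in Type.h earlier in this patch. A minimal sketch of the expected semantics, mirroring the removed loop (the authoritative definition lives in clang/lib/AST/Type.cpp and may differ in detail):
//   bool Type::isHLSLResourceRecordArray() const {
//     const Type *Ty = this;
//     if (!isa<ConstantArrayType>(Ty))
//       return false; // a plain resource record is handled separately
//     while (const auto *CAT = dyn_cast<ConstantArrayType>(Ty))
//       Ty = CAT->getArrayElementTypeNoTypeQual();
//     return Ty->isHLSLResourceRecord();
//   }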
void CGHLSLRuntime::emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl, @@ -146,7 +139,7 @@ void CGHLSLRuntime::emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl, if (VDTy.getAddressSpace() != LangAS::hlsl_constant) { if (VD->getStorageClass() == SC_Static || VDTy.getAddressSpace() == LangAS::hlsl_groupshared || - isResourceRecordTypeOrArrayOf(VDTy.getTypePtr())) { + VDTy->isHLSLResourceRecord() || VDTy->isHLSLResourceRecordArray()) { // Emit static and groupshared variables and resource classes inside // cbuffer as regular globals CGM.EmitGlobal(VD); diff --git a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp index 33a8d8f..1a1889a 100644 --- a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp @@ -246,35 +246,26 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, llvm::FunctionType *LLVMFuncTy = cast<llvm::FunctionType>(ConvertType(QualType(FuncTy, 0))); + bool VarArg = LLVMFuncTy->isVarArg(); unsigned NParams = LLVMFuncTy->getNumParams(); std::vector<Value *> Args; - Args.reserve(NParams + 3); + Args.reserve(NParams + 3 + VarArg); // The only real argument is the FuncRef Args.push_back(FuncRef); // Add the type information - auto addType = [this, &Args](llvm::Type *T) { - if (T->isVoidTy()) { - // Do nothing - } else if (T->isFloatingPointTy()) { - Args.push_back(ConstantFP::get(T, 0)); - } else if (T->isIntegerTy()) { - Args.push_back(ConstantInt::get(T, 0)); - } else if (T->isPointerTy()) { - Args.push_back(ConstantPointerNull::get(llvm::PointerType::get( - getLLVMContext(), T->getPointerAddressSpace()))); - } else { - // TODO: Handle reference types. For now, we reject them in Sema. - llvm_unreachable("Unhandled type"); - } - }; - - addType(LLVMFuncTy->getReturnType()); + llvm::Type *RetType = LLVMFuncTy->getReturnType(); + if (!RetType->isVoidTy()) { + Args.push_back(PoisonValue::get(RetType)); + } // The token type indicates the boundary between return types and param // types. Args.push_back(PoisonValue::get(llvm::Type::getTokenTy(getLLVMContext()))); for (unsigned i = 0; i < NParams; i++) { - addType(LLVMFuncTy->getParamType(i)); + Args.push_back(PoisonValue::get(LLVMFuncTy->getParamType(i))); + } + if (VarArg) { + Args.push_back(PoisonValue::get(Builder.getPtrTy())); } Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_test_func); return Builder.CreateCall(Callee, Args); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 9f77e62..ccc3154 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3936,47 +3936,18 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_fsanitize_ignorelist_EQ, F); switch (Opts.getClangABICompat()) { - case LangOptions::ClangABI::Ver3_8: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "3.8"); +#define ABI_VER_MAJOR_MINOR(Major, Minor) \ + case LangOptions::ClangABI::Ver##Major##_##Minor: \ + GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, #Major "." 
#Minor); \ break; - case LangOptions::ClangABI::Ver4: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "4.0"); +#define ABI_VER_MAJOR(Major) \ + case LangOptions::ClangABI::Ver##Major: \ + GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, #Major ".0"); \ break; - case LangOptions::ClangABI::Ver6: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "6.0"); - break; - case LangOptions::ClangABI::Ver7: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "7.0"); - break; - case LangOptions::ClangABI::Ver9: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "9.0"); - break; - case LangOptions::ClangABI::Ver11: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "11.0"); - break; - case LangOptions::ClangABI::Ver12: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "12.0"); - break; - case LangOptions::ClangABI::Ver14: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "14.0"); - break; - case LangOptions::ClangABI::Ver15: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "15.0"); - break; - case LangOptions::ClangABI::Ver17: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "17.0"); - break; - case LangOptions::ClangABI::Ver18: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "18.0"); - break; - case LangOptions::ClangABI::Ver19: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "19.0"); - break; - case LangOptions::ClangABI::Ver20: - GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "20.0"); - break; - case LangOptions::ClangABI::Latest: +#define ABI_VER_LATEST(Latest) \ + case LangOptions::ClangABI::Latest: \ break; +#include "clang/Basic/ABIVersions.def" } if (Opts.getSignReturnAddressScope() == @@ -4482,32 +4453,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, !VerParts.second.getAsInteger(10, Minor) : VerParts.first.size() == Ver.size() || VerParts.second == "0")) { // Got a valid version number. 
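// With .def entries along the lines of ABI_VER_MAJOR_MINOR(3, 8), ABI_VER_MAJOR(4), ..., ABI_VER_LATEST(Latest) (entry names inferred from the macro documentation in ABIVersions.def), the include below expands to the same cascading if/else chain that was previously written out by hand:
//   if (std::tie(Major, Minor) <= std::tuple(3, 8))
//     Opts.setClangABICompat(LangOptions::ClangABI::Ver3_8);
//   else if (Major <= 4)
//     Opts.setClangABICompat(LangOptions::ClangABI::Ver4);
//   else if ...
// so a future ABI version needs only one new line in the .def file.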
- if (Major == 3 && Minor <= 8) - Opts.setClangABICompat(LangOptions::ClangABI::Ver3_8); - else if (Major <= 4) - Opts.setClangABICompat(LangOptions::ClangABI::Ver4); - else if (Major <= 6) - Opts.setClangABICompat(LangOptions::ClangABI::Ver6); - else if (Major <= 7) - Opts.setClangABICompat(LangOptions::ClangABI::Ver7); - else if (Major <= 9) - Opts.setClangABICompat(LangOptions::ClangABI::Ver9); - else if (Major <= 11) - Opts.setClangABICompat(LangOptions::ClangABI::Ver11); - else if (Major <= 12) - Opts.setClangABICompat(LangOptions::ClangABI::Ver12); - else if (Major <= 14) - Opts.setClangABICompat(LangOptions::ClangABI::Ver14); - else if (Major <= 15) - Opts.setClangABICompat(LangOptions::ClangABI::Ver15); - else if (Major <= 17) - Opts.setClangABICompat(LangOptions::ClangABI::Ver17); - else if (Major <= 18) - Opts.setClangABICompat(LangOptions::ClangABI::Ver18); - else if (Major <= 19) - Opts.setClangABICompat(LangOptions::ClangABI::Ver19); - else if (Major <= 20) - Opts.setClangABICompat(LangOptions::ClangABI::Ver20); +#define ABI_VER_MAJOR_MINOR(Major_, Minor_) \ + if (std::tie(Major, Minor) <= std::tuple(Major_, Minor_)) \ + Opts.setClangABICompat(LangOptions::ClangABI::Ver##Major_##_##Minor_); \ + else +#define ABI_VER_MAJOR(Major_) \ + if (Major <= Major_) \ + Opts.setClangABICompat(LangOptions::ClangABI::Ver##Major_); \ + else +#define ABI_VER_LATEST(Latest) \ + { /* Equivalent to latest version - do nothing */ \ + } +#include "clang/Basic/ABIVersions.def" } else if (Ver != "latest") { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 440552c..74343c3 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -835,45 +835,38 @@ _mm512_xor_si512(__m512i __a, __m512i __b) /* Arithmetic */ -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_add_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a + (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_add_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a + (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mul_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a * (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mul_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a * (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_sub_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sub_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a - (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_sub_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sub_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a - (__v16sf)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi64 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_epi64(__m512i __A, __m512i __B) { return (__m512i) ((__v8du) __A + (__v8du) __B); } @@ 
-2315,9 +2308,8 @@ _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_div_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_div_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a/(__v8df)__b); } @@ -2335,9 +2327,8 @@ _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_div_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } @@ -4123,9 +4114,8 @@ _mm512_cvtss_f32(__m512 __a) /* Unpack and Interleave */ -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); } @@ -4146,9 +4136,8 @@ _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); } @@ -4169,9 +4158,8 @@ _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_ps(__m512 __a, __m512 __b) { return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 2, 18, 3, 19, 2+4, 18+4, 3+4, 19+4, @@ -4195,9 +4183,8 @@ _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_ps(__m512 __a, __m512 __b) { return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 0, 16, 1, 17, 0+4, 16+4, 1+4, 17+4, @@ -9337,19 +9324,23 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) * This takes log2(n) steps where n is the number of elements in the vector. 
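 * For example, reducing the eight 64-bit elements of a 512-bit vector takes
 * log2(8) = 3 pairwise combining steps.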
*/ -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_add_epi64(__m512i __W) { return __builtin_reduce_add((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_mul_epi64(__m512i __W) { return __builtin_reduce_mul((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_and_epi64(__m512i __W) { return __builtin_reduce_and((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_or_epi64(__m512i __W) { return __builtin_reduce_or((__v8di)__W); } @@ -9400,22 +9391,22 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_add_epi32(__m512i __W) { return __builtin_reduce_add((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_mul_epi32(__m512i __W) { return __builtin_reduce_mul((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_and_epi32(__m512i __W) { return __builtin_reduce_and((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_or_epi32(__m512i __W) { return __builtin_reduce_or((__v16si)__W); } @@ -9466,22 +9457,22 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epi64(__m512i __V) { return __builtin_reduce_max((__v8di)__V); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epu64(__m512i __V) { return __builtin_reduce_max((__v8du)__V); } -static __inline__ long long __DEFAULT_FN_ATTRS512 +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epi64(__m512i __V) { return __builtin_reduce_min((__v8di)__V); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epu64(__m512i __V) { return __builtin_reduce_min((__v8du)__V); } @@ -9509,22 +9500,22 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V); return __builtin_reduce_min((__v8du)__V); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epi32(__m512i __V) { return __builtin_reduce_max((__v16si)__V); } -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epu32(__m512i __V) { return __builtin_reduce_max((__v16su)__V); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epi32(__m512i __V) { return __builtin_reduce_min((__v16si)__V); } -static __inline__ unsigned int 
__DEFAULT_FN_ATTRS512 +static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epu32(__m512i __V) { return __builtin_reduce_min((__v16su)__V); } diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 1da50f0..2be4f68 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -87,9 +87,8 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); /// A 256-bit vector of [4 x double] containing one of the source operands. /// \returns A 256-bit vector of [4 x double] containing the sums of both /// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_add_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_add_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a+(__v4df)__b); } @@ -105,9 +104,8 @@ _mm256_add_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the source operands. /// \returns A 256-bit vector of [8 x float] containing the sums of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_add_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a+(__v8sf)__b); } @@ -123,9 +121,8 @@ _mm256_add_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the differences between /// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_sub_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_sub_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a-(__v4df)__b); } @@ -141,9 +138,8 @@ _mm256_sub_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the differences between /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_sub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a-(__v8sf)__b); } @@ -197,9 +193,8 @@ _mm256_addsub_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing the divisor. /// \returns A 256-bit vector of [4 x double] containing the quotients of both /// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_div_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_div_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a/(__v4df)__b); } @@ -215,9 +210,8 @@ _mm256_div_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing the divisor. /// \returns A 256-bit vector of [8 x float] containing the quotients of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_div_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a/(__v8sf)__b); } @@ -317,9 +311,8 @@ _mm256_min_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing one of the operands. /// \returns A 256-bit vector of [4 x double] containing the products of both /// operands. 
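/// (With the _CONSTEXPR attribute variants these arithmetic intrinsics become
/// usable in constant expressions; for example, assuming v1 and v2 are
/// hypothetical constexpr __m256d values,
///   constexpr __m256d p = _mm256_mul_pd(v1, v2);
/// would now be accepted at compile time.)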
-static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_mul_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_mul_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a * (__v4df)__b); } @@ -335,9 +328,8 @@ _mm256_mul_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the operands. /// \returns A 256-bit vector of [8 x float] containing the products of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_mul_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a * (__v8sf)__b); } @@ -2462,9 +2454,8 @@ _mm256_movedup_pd(__m256d __a) /// Bits [127:64] are written to bits [127:64] of the return value. \n /// Bits [255:192] are written to bits [255:192] of the return value. \n /// \returns A 256-bit vector of [4 x double] containing the interleaved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpackhi_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpackhi_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); } @@ -2484,9 +2475,8 @@ _mm256_unpackhi_pd(__m256d __a, __m256d __b) /// Bits [63:0] are written to bits [127:64] of the return value. \n /// Bits [191:128] are written to bits [255:192] of the return value. \n /// \returns A 256-bit vector of [4 x double] containing the interleaved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpacklo_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpacklo_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); } @@ -2511,9 +2501,8 @@ _mm256_unpacklo_pd(__m256d __a, __m256d __b) /// Bits [223:192] are written to bits [191:160] of the return value. \n /// Bits [255:224] are written to bits [255:224] of the return value. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpackhi_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpackhi_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); } @@ -2538,9 +2527,8 @@ _mm256_unpackhi_ps(__m256 __a, __m256 __b) /// Bits [159:128] are written to bits [191:160] of the return value. \n /// Bits [191:160] are written to bits [255:224] of the return value. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpacklo_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpacklo_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); } diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index 52addb7..ce8c79e 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -345,10 +345,15 @@ static __inline int __get_cpuid_count (unsigned int __leaf, // In some configurations, __cpuidex is defined as a builtin (primarily // -fms-extensions) which will conflict with the __cpuidex definition below. #if !(__has_builtin(__cpuidex)) +// In some cases, offloading will set the host as the aux triple and define the +// builtin. 
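// (Concretely: during CUDA device-side parsing with an x86 host as the aux
// target, __cpuidex can exist as an aux-target builtin while
// __has_builtin(__cpuidex) still reports false, so without the guard below
// this header would define a conflicting version; scenario inferred from the
// guard added here.)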
Given __has_builtin does not detect builtins on aux triples, we need +// to explicitly check for some offloading cases. +#ifndef __NVPTX__ static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) { __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2], __cpu_info[3]); } #endif +#endif #endif /* __CPUID_H */ diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 8536e04..17f17f8 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -337,16 +337,9 @@ static bool isZeroSizedArray(const ConstantArrayType *CAT) { return CAT != nullptr; } -// Returns true if the record type is an HLSL resource class or an array of -// resource classes -static bool isResourceRecordTypeOrArrayOf(const Type *Ty) { - while (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty)) - Ty = CAT->getArrayElementTypeNoTypeQual(); - return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr; -} - static bool isResourceRecordTypeOrArrayOf(VarDecl *VD) { - return isResourceRecordTypeOrArrayOf(VD->getType().getTypePtr()); + const Type *Ty = VD->getType().getTypePtr(); + return Ty->isHLSLResourceRecord() || Ty->isHLSLResourceRecordArray(); } // Returns true if the type is a leaf element type that is not valid to be @@ -355,7 +348,7 @@ static bool isResourceRecordTypeOrArrayOf(VarDecl *VD) { // type or if it is a record type that needs to be inspected further. static bool isInvalidConstantBufferLeafElementType(const Type *Ty) { Ty = Ty->getUnqualifiedDesugaredType(); - if (isResourceRecordTypeOrArrayOf(Ty)) + if (Ty->isHLSLResourceRecord() || Ty->isHLSLResourceRecordArray()) return true; if (Ty->isRecordType()) return Ty->getAsCXXRecordDecl()->isEmpty(); @@ -3597,7 +3590,7 @@ void SemaHLSL::deduceAddressSpace(VarDecl *Decl) { return; // Resource handles. - if (isResourceRecordTypeOrArrayOf(Type->getUnqualifiedDesugaredType())) + if (Type->isHLSLResourceRecord() || Type->isHLSLResourceRecordArray()) return; // Only static globals belong to the Private address space. @@ -3637,10 +3630,7 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) { if (VD->getType()->isHLSLIntangibleType()) collectResourceBindingsOnVarDecl(VD); - const Type *VarType = VD->getType().getTypePtr(); - while (VarType->isArrayType()) - VarType = VarType->getArrayElementTypeNoTypeQual(); - if (VarType->isHLSLResourceRecord() || + if (isResourceRecordTypeOrArrayOf(VD) || VD->hasAttr<HLSLVkConstantIdAttr>()) { // Make the variable for resources static. The global externally visible // storage is accessed through the handle, which is a member. The variable diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index b6b8932..2d8fdb5 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -367,7 +367,7 @@ bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II, // The code is missing a 'template' keyword prior to the dependent template // name. 
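// Note: the C-style cast removed below was redundant; CXXScopeSpec::getScopeRep()
// already returns a NestedNameSpecifier *, so its result is usable directly.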
- NestedNameSpecifier *Qualifier = (NestedNameSpecifier *)SS->getScopeRep(); + NestedNameSpecifier *Qualifier = SS->getScopeRep(); SuggestedTemplate = TemplateTy::make(Context.getDependentTemplateName( {Qualifier, &II, /*HasTemplateKeyword=*/false})); Diag(IILoc, diag::err_template_kw_missing) diff --git a/clang/lib/Sema/SemaWasm.cpp b/clang/lib/Sema/SemaWasm.cpp index 8998492..e773113 100644 --- a/clang/lib/Sema/SemaWasm.cpp +++ b/clang/lib/Sema/SemaWasm.cpp @@ -17,6 +17,7 @@ #include "clang/Basic/AddressSpaces.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Sema/Attr.h" #include "clang/Sema/Sema.h" @@ -227,7 +228,8 @@ bool SemaWasm::BuiltinWasmTableCopy(CallExpr *TheCall) { return false; } -bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(CallExpr *TheCall) { +bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(const TargetInfo &TI, + CallExpr *TheCall) { if (SemaRef.checkArgCount(TheCall, 1)) return true; @@ -250,27 +252,31 @@ bool SemaWasm::BuiltinWasmTestFunctionPointerSignature(CallExpr *TheCall) { << ArgType << FuncPtrArg->getSourceRange(); } - // Check that the function pointer doesn't use reference types - if (FuncTy->getReturnType().isWebAssemblyReferenceType()) { - return Diag( - FuncPtrArg->getBeginLoc(), - diag::err_wasm_builtin_test_fp_sig_cannot_include_reference_type) - << 0 << FuncTy->getReturnType() << FuncPtrArg->getSourceRange(); - } - auto NParams = FuncTy->getNumParams(); - for (unsigned I = 0; I < NParams; I++) { - if (FuncTy->getParamType(I).isWebAssemblyReferenceType()) { + if (TI.getABI() == "experimental-mv") { + auto isStructOrUnion = [](QualType T) { + return T->isUnionType() || T->isStructureType(); + }; + if (isStructOrUnion(FuncTy->getReturnType())) { return Diag( FuncPtrArg->getBeginLoc(), diag:: - err_wasm_builtin_test_fp_sig_cannot_include_reference_type) - << 1 << FuncPtrArg->getSourceRange(); + err_wasm_builtin_test_fp_sig_cannot_include_struct_or_union) + << 0 << FuncTy->getReturnType() << FuncPtrArg->getSourceRange(); + } + auto NParams = FuncTy->getNumParams(); + for (unsigned I = 0; I < NParams; I++) { + if (isStructOrUnion(FuncTy->getParamType(I))) { + return Diag( + FuncPtrArg->getBeginLoc(), + diag:: + err_wasm_builtin_test_fp_sig_cannot_include_struct_or_union) + << 1 << FuncPtrArg->getSourceRange(); + } } } // Set return type to int (the result of the test) TheCall->setType(getASTContext().IntTy); - return false; } @@ -297,7 +303,7 @@ bool SemaWasm::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, case WebAssembly::BI__builtin_wasm_table_copy: return BuiltinWasmTableCopy(TheCall); case WebAssembly::BI__builtin_wasm_test_function_pointer_signature: - return BuiltinWasmTestFunctionPointerSignature(TheCall); + return BuiltinWasmTestFunctionPointerSignature(TI, TheCall); } return false; diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp index 62bc321..65ff902 100644 --- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp @@ -840,20 +840,27 @@ ProgramStateRef RetainCountChecker::updateSymbol(ProgramStateRef state, const RefCountBug & RetainCountChecker::errorKindToBugKind(RefVal::Kind ErrorKind, SymbolRef Sym) const { + const RefCountFrontend &FE = getPreferredFrontend(); + switch (ErrorKind) { case 
RefVal::ErrorUseAfterRelease: - return *UseAfterRelease; + return FE.UseAfterRelease; case RefVal::ErrorReleaseNotOwned: - return *ReleaseNotOwned; + return FE.ReleaseNotOwned; case RefVal::ErrorDeallocNotOwned: if (Sym->getType()->getPointeeCXXRecordDecl()) - return *FreeNotOwned; - return *DeallocNotOwned; + return FE.FreeNotOwned; + return FE.DeallocNotOwned; default: llvm_unreachable("Unhandled error."); } } +bool RetainCountChecker::isReleaseUnownedError(RefVal::Kind ErrorKind) const { + return ErrorKind == RefVal::ErrorReleaseNotOwned || + ErrorKind == RefVal::ErrorDeallocNotOwned; +} + void RetainCountChecker::processNonLeakError(ProgramStateRef St, SourceRange ErrorRange, RefVal::Kind ErrorKind, @@ -874,8 +881,8 @@ void RetainCountChecker::processNonLeakError(ProgramStateRef St, return; auto report = std::make_unique<RefCountReport>( - errorKindToBugKind(ErrorKind, Sym), - C.getASTContext().getLangOpts(), N, Sym); + errorKindToBugKind(ErrorKind, Sym), C.getASTContext().getLangOpts(), N, + Sym, /*isLeak=*/false, isReleaseUnownedError(ErrorKind)); report->addRange(ErrorRange); C.emitReport(std::move(report)); } @@ -1090,8 +1097,8 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S, ExplodedNode *N = C.addTransition(state, Pred); if (N) { const LangOptions &LOpts = C.getASTContext().getLangOpts(); - auto R = - std::make_unique<RefLeakReport>(*LeakAtReturn, LOpts, N, Sym, C); + auto R = std::make_unique<RefLeakReport>( + getPreferredFrontend().LeakAtReturn, LOpts, N, Sym, C); C.emitReport(std::move(R)); } return N; @@ -1113,7 +1120,8 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S, ExplodedNode *N = C.addTransition(state, Pred); if (N) { auto R = std::make_unique<RefCountReport>( - *ReturnNotOwnedForOwned, C.getASTContext().getLangOpts(), N, Sym); + getPreferredFrontend().ReturnNotOwnedForOwned, + C.getASTContext().getLangOpts(), N, Sym); C.emitReport(std::move(R)); } return N; @@ -1261,8 +1269,8 @@ ProgramStateRef RetainCountChecker::handleAutoreleaseCounts( os << "has a +" << V.getCount() << " retain count"; const LangOptions &LOpts = Ctx.getASTContext().getLangOpts(); - auto R = std::make_unique<RefCountReport>(*OverAutorelease, LOpts, N, Sym, - os.str()); + auto R = std::make_unique<RefCountReport>( + getPreferredFrontend().OverAutorelease, LOpts, N, Sym, os.str()); Ctx.emitReport(std::move(R)); } @@ -1307,8 +1315,10 @@ RetainCountChecker::processLeaks(ProgramStateRef state, const LangOptions &LOpts = Ctx.getASTContext().getLangOpts(); if (N) { + const RefCountFrontend &FE = getPreferredFrontend(); + const RefCountBug &BT = Pred ? FE.LeakWithinFunction : FE.LeakAtReturn; + for (SymbolRef L : Leaked) { - const RefCountBug &BT = Pred ? 
*LeakWithinFunction : *LeakAtReturn; Ctx.emitReport(std::make_unique<RefLeakReport>(BT, LOpts, N, L, Ctx)); } } @@ -1463,44 +1473,31 @@ std::unique_ptr<SimpleProgramPointTag> RetainCountChecker::DeallocSentTag; std::unique_ptr<SimpleProgramPointTag> RetainCountChecker::CastFailTag; void ento::registerRetainCountBase(CheckerManager &Mgr) { - auto *Chk = Mgr.registerChecker<RetainCountChecker>(); + auto *Chk = Mgr.getChecker<RetainCountChecker>(); Chk->DeallocSentTag = std::make_unique<SimpleProgramPointTag>( "RetainCountChecker", "DeallocSent"); Chk->CastFailTag = std::make_unique<SimpleProgramPointTag>( "RetainCountChecker", "DynamicCastFail"); } -bool ento::shouldRegisterRetainCountBase(const CheckerManager &mgr) { +bool ento::shouldRegisterRetainCountBase(const CheckerManager &) { return true; } + void ento::registerRetainCountChecker(CheckerManager &Mgr) { auto *Chk = Mgr.getChecker<RetainCountChecker>(); - Chk->TrackObjCAndCFObjects = true; + Chk->RetainCount.enable(Mgr); Chk->TrackNSCFStartParam = Mgr.getAnalyzerOptions().getCheckerBooleanOption( Mgr.getCurrentCheckerName(), "TrackNSCFStartParam"); - -#define INIT_BUGTYPE(KIND) \ - Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(), \ - RefCountBug::KIND); - // TODO: Ideally, we should have a checker for each of these bug types. - INIT_BUGTYPE(UseAfterRelease) - INIT_BUGTYPE(ReleaseNotOwned) - INIT_BUGTYPE(DeallocNotOwned) - INIT_BUGTYPE(FreeNotOwned) - INIT_BUGTYPE(OverAutorelease) - INIT_BUGTYPE(ReturnNotOwnedForOwned) - INIT_BUGTYPE(LeakWithinFunction) - INIT_BUGTYPE(LeakAtReturn) -#undef INIT_BUGTYPE } -bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) { +bool ento::shouldRegisterRetainCountChecker(const CheckerManager &) { return true; } void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) { auto *Chk = Mgr.getChecker<RetainCountChecker>(); - Chk->TrackOSObjects = true; + Chk->OSObjectRetainCount.enable(Mgr); // FIXME: We want bug reports to always have the same checker name associated // with them, yet here, if RetainCountChecker is disabled but @@ -1511,21 +1508,8 @@ void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) { // diagnostics, and **hidden checker options** with the fine-tuning of // modeling. Following this logic, OSObjectRetainCountChecker should be the // latter, but we can't just remove it for backward compatibility reasons. 
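// With the CheckerFamily refactor, each RefCountFrontend owns its BugType
// instances (see RetainCountDiagnostics.h later in this patch), so the
// per-kind LAZY_INIT_BUGTYPE initialization removed here is unnecessary:
// enabling the frontend via Chk->OSObjectRetainCount.enable(Mgr) above
// already activates its bug types.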
-#define LAZY_INIT_BUGTYPE(KIND) \ - if (!Chk->KIND) \ - Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(), \ - RefCountBug::KIND); - LAZY_INIT_BUGTYPE(UseAfterRelease) - LAZY_INIT_BUGTYPE(ReleaseNotOwned) - LAZY_INIT_BUGTYPE(DeallocNotOwned) - LAZY_INIT_BUGTYPE(FreeNotOwned) - LAZY_INIT_BUGTYPE(OverAutorelease) - LAZY_INIT_BUGTYPE(ReturnNotOwnedForOwned) - LAZY_INIT_BUGTYPE(LeakWithinFunction) - LAZY_INIT_BUGTYPE(LeakAtReturn) -#undef LAZY_INIT_BUGTYPE } -bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &mgr) { +bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &) { return true; } diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h index 0e81143..8854e10 100644 --- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h +++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h @@ -235,51 +235,32 @@ public: }; class RetainCountChecker - : public Checker< check::Bind, - check::DeadSymbols, - check::BeginFunction, - check::EndFunction, - check::PostStmt<BlockExpr>, - check::PostStmt<CastExpr>, - check::PostStmt<ObjCArrayLiteral>, - check::PostStmt<ObjCDictionaryLiteral>, - check::PostStmt<ObjCBoxedExpr>, - check::PostStmt<ObjCIvarRefExpr>, - check::PostCall, - check::RegionChanges, - eval::Assume, - eval::Call > { + : public CheckerFamily< + check::Bind, check::DeadSymbols, check::BeginFunction, + check::EndFunction, check::PostStmt<BlockExpr>, + check::PostStmt<CastExpr>, check::PostStmt<ObjCArrayLiteral>, + check::PostStmt<ObjCDictionaryLiteral>, + check::PostStmt<ObjCBoxedExpr>, check::PostStmt<ObjCIvarRefExpr>, + check::PostCall, check::RegionChanges, eval::Assume, eval::Call> { public: - std::unique_ptr<RefCountBug> UseAfterRelease; - std::unique_ptr<RefCountBug> ReleaseNotOwned; - std::unique_ptr<RefCountBug> DeallocNotOwned; - std::unique_ptr<RefCountBug> FreeNotOwned; - std::unique_ptr<RefCountBug> OverAutorelease; - std::unique_ptr<RefCountBug> ReturnNotOwnedForOwned; - std::unique_ptr<RefCountBug> LeakWithinFunction; - std::unique_ptr<RefCountBug> LeakAtReturn; + RefCountFrontend RetainCount; + RefCountFrontend OSObjectRetainCount; mutable std::unique_ptr<RetainSummaryManager> Summaries; static std::unique_ptr<SimpleProgramPointTag> DeallocSentTag; static std::unique_ptr<SimpleProgramPointTag> CastFailTag; - /// Track Objective-C and CoreFoundation objects. - bool TrackObjCAndCFObjects = false; - - /// Track sublcasses of OSObject. - bool TrackOSObjects = false; - /// Track initial parameters (for the entry point) for NS/CF objects. bool TrackNSCFStartParam = false; - RetainCountChecker() {}; + StringRef getDebugTag() const override { return "RetainCountChecker"; } RetainSummaryManager &getSummaryManager(ASTContext &Ctx) const { if (!Summaries) - Summaries.reset( - new RetainSummaryManager(Ctx, TrackObjCAndCFObjects, TrackOSObjects)); + Summaries = std::make_unique<RetainSummaryManager>( + Ctx, RetainCount.isEnabled(), OSObjectRetainCount.isEnabled()); return *Summaries; } @@ -287,6 +268,15 @@ public: return getSummaryManager(C.getASTContext()); } + const RefCountFrontend &getPreferredFrontend() const { + // FIXME: The two frontends of this checker family are in an unusual + // relationship: if they are both enabled, then all bug reports are + // reported by RetainCount (i.e. `osx.cocoa.RetainCount`), even the bugs + // that "belong to" OSObjectRetainCount (i.e. 
`osx.OSObjectRetainCount`). + // This is counter-intuitive and should be fixed to avoid confusion. + return RetainCount.isEnabled() ? RetainCount : OSObjectRetainCount; + } + void printState(raw_ostream &Out, ProgramStateRef State, const char *NL, const char *Sep) const override; @@ -337,6 +327,8 @@ public: const RefCountBug &errorKindToBugKind(RefVal::Kind ErrorKind, SymbolRef Sym) const; + bool isReleaseUnownedError(RefVal::Kind ErrorKind) const; + void processNonLeakError(ProgramStateRef St, SourceRange ErrorRange, RefVal::Kind ErrorKind, SymbolRef Sym, CheckerContext &C) const; diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp index c9f5dc9..cad2c72 100644 --- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp @@ -21,57 +21,6 @@ using namespace clang; using namespace ento; using namespace retaincountchecker; -StringRef RefCountBug::bugTypeToName(RefCountBug::RefCountBugKind BT) { - switch (BT) { - case UseAfterRelease: - return "Use-after-release"; - case ReleaseNotOwned: - return "Bad release"; - case DeallocNotOwned: - return "-dealloc sent to non-exclusively owned object"; - case FreeNotOwned: - return "freeing non-exclusively owned object"; - case OverAutorelease: - return "Object autoreleased too many times"; - case ReturnNotOwnedForOwned: - return "Method should return an owned object"; - case LeakWithinFunction: - return "Leak"; - case LeakAtReturn: - return "Leak of returned object"; - } - llvm_unreachable("Unknown RefCountBugKind"); -} - -StringRef RefCountBug::getDescription() const { - switch (BT) { - case UseAfterRelease: - return "Reference-counted object is used after it is released"; - case ReleaseNotOwned: - return "Incorrect decrement of the reference count of an object that is " - "not owned at this point by the caller"; - case DeallocNotOwned: - return "-dealloc sent to object that may be referenced elsewhere"; - case FreeNotOwned: - return "'free' called on an object that may be referenced elsewhere"; - case OverAutorelease: - return "Object autoreleased too many times"; - case ReturnNotOwnedForOwned: - return "Object with a +0 retain count returned to caller where a +1 " - "(owning) retain count is expected"; - case LeakWithinFunction: - case LeakAtReturn: - return ""; - } - llvm_unreachable("Unknown RefCountBugKind"); -} - -RefCountBug::RefCountBug(CheckerNameRef Checker, RefCountBugKind BT) - : BugType(Checker, bugTypeToName(BT), categories::MemoryRefCount, - /*SuppressOnSink=*/BT == LeakWithinFunction || - BT == LeakAtReturn), - BT(BT) {} - static bool isNumericLiteralExpression(const Expr *E) { // FIXME: This set of cases was copied from SemaExprObjC. 
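// isNumericLiteralExpression reports whether E is a bare numeric literal
// (integer, character, floating, and similar literals; see the isa<> list
// below), which the surrounding diagnostics use when phrasing suggestions.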
return isa<IntegerLiteral, CharacterLiteral, FloatingLiteral, @@ -312,9 +261,11 @@ namespace retaincountchecker { class RefCountReportVisitor : public BugReporterVisitor { protected: SymbolRef Sym; + bool IsReleaseUnowned; public: - RefCountReportVisitor(SymbolRef sym) : Sym(sym) {} + RefCountReportVisitor(SymbolRef S, bool IRU) + : Sym(S), IsReleaseUnowned(IRU) {} void Profile(llvm::FoldingSetNodeID &ID) const override { static int x = 0; @@ -334,7 +285,8 @@ public: class RefLeakReportVisitor : public RefCountReportVisitor { public: RefLeakReportVisitor(SymbolRef Sym, const MemRegion *LastBinding) - : RefCountReportVisitor(Sym), LastBinding(LastBinding) {} + : RefCountReportVisitor(Sym, /*IsReleaseUnowned=*/false), + LastBinding(LastBinding) {} PathDiagnosticPieceRef getEndPath(BugReporterContext &BRC, const ExplodedNode *N, @@ -452,12 +404,6 @@ annotateStartParameter(const ExplodedNode *N, SymbolRef Sym, PathDiagnosticPieceRef RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC, PathSensitiveBugReport &BR) { - - const auto &BT = static_cast<const RefCountBug&>(BR.getBugType()); - - bool IsFreeUnowned = BT.getBugType() == RefCountBug::FreeNotOwned || - BT.getBugType() == RefCountBug::DeallocNotOwned; - const SourceManager &SM = BRC.getSourceManager(); CallEventManager &CEMgr = BRC.getStateManager().getCallEventManager(); if (auto CE = N->getLocationAs<CallExitBegin>()) @@ -490,7 +436,7 @@ RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC, std::string sbuf; llvm::raw_string_ostream os(sbuf); - if (PrevT && IsFreeUnowned && CurrV.isNotOwned() && PrevT->isOwned()) { + if (PrevT && IsReleaseUnowned && CurrV.isNotOwned() && PrevT->isOwned()) { os << "Object is now not exclusively owned"; auto Pos = PathDiagnosticLocation::create(N->getLocation(), SM); return std::make_shared<PathDiagnosticEventPiece>(Pos, sbuf); @@ -815,10 +761,8 @@ RefLeakReportVisitor::getEndPath(BugReporterContext &BRC, if (K == ObjKind::ObjC || K == ObjKind::CF) { os << "whose name ('" << *FD << "') does not contain 'Copy' or 'Create'. 
This violates the " - "naming" - " convention rules given in the Memory Management Guide for " - "Core" - " Foundation"; + "naming convention rules given in the Memory Management Guide " + "for Core Foundation"; } else if (RV->getObjKind() == ObjKind::OS) { std::string FuncName = FD->getNameAsString(); os << "whose name ('" << FuncName << "') starts with '" @@ -836,19 +780,20 @@ RefLeakReportVisitor::getEndPath(BugReporterContext &BRC, } RefCountReport::RefCountReport(const RefCountBug &D, const LangOptions &LOpts, - ExplodedNode *n, SymbolRef sym, bool isLeak) - : PathSensitiveBugReport(D, D.getDescription(), n), Sym(sym), + ExplodedNode *n, SymbolRef sym, bool isLeak, + bool IsReleaseUnowned) + : PathSensitiveBugReport(D, D.getReportMessage(), n), Sym(sym), isLeak(isLeak) { if (!isLeak) - addVisitor<RefCountReportVisitor>(sym); + addVisitor<RefCountReportVisitor>(sym, IsReleaseUnowned); } RefCountReport::RefCountReport(const RefCountBug &D, const LangOptions &LOpts, ExplodedNode *n, SymbolRef sym, StringRef endText) - : PathSensitiveBugReport(D, D.getDescription(), endText, n) { + : PathSensitiveBugReport(D, D.getReportMessage(), endText, n) { - addVisitor<RefCountReportVisitor>(sym); + addVisitor<RefCountReportVisitor>(sym, /*IsReleaseUnowned=*/false); } void RefLeakReport::deriveParamLocation(CheckerContext &Ctx) { diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h index d059008..6ceb86f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h +++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h @@ -25,25 +25,44 @@ namespace ento { namespace retaincountchecker { class RefCountBug : public BugType { + StringRef ReportMessage; + public: - enum RefCountBugKind { - UseAfterRelease, - ReleaseNotOwned, - DeallocNotOwned, - FreeNotOwned, - OverAutorelease, - ReturnNotOwnedForOwned, - LeakWithinFunction, - LeakAtReturn, - }; - RefCountBug(CheckerNameRef Checker, RefCountBugKind BT); - StringRef getDescription() const; - - RefCountBugKind getBugType() const { return BT; } - -private: - RefCountBugKind BT; - static StringRef bugTypeToName(RefCountBugKind BT); + RefCountBug(const CheckerFrontend *CF, StringRef Desc, StringRef ReportMsg, + bool SuppressOnSink = false) + : BugType(CF, Desc, categories::MemoryRefCount, SuppressOnSink), + ReportMessage(ReportMsg) {} + StringRef getReportMessage() const { return ReportMessage; } +}; + +class RefCountFrontend : public CheckerFrontend { +public: + const RefCountBug UseAfterRelease{ + this, "Use-after-release", + "Reference-counted object is used after it is released"}; + const RefCountBug ReleaseNotOwned{ + this, "Bad release", + "Incorrect decrement of the reference count of an object that is not " + "owned at this point by the caller"}; + const RefCountBug DeallocNotOwned{ + this, "-dealloc sent to non-exclusively owned object", + "-dealloc sent to object that may be referenced elsewhere"}; + const RefCountBug FreeNotOwned{ + this, "freeing non-exclusively owned object", + "'free' called on an object that may be referenced elsewhere"}; + const RefCountBug OverAutorelease{this, "Object autoreleased too many times", + "Object autoreleased too many times"}; + const RefCountBug ReturnNotOwnedForOwned{ + this, "Method should return an owned object", + "Object with a +0 retain count returned to caller where a +1 (owning) " + "retain count is expected"}; + // For these two bug 
types the report message will be generated dynamically + // by `RefLeakReport::createDescription` so the empty string taken from the + // BugType will be ignored (overwritten). + const RefCountBug LeakWithinFunction{this, "Leak", /*ReportMsg=*/"", + /*SuppressOnSink=*/true}; + const RefCountBug LeakAtReturn{this, "Leak of returned object", + /*ReportMsg=*/"", /*SuppressOnSink=*/true}; }; class RefCountReport : public PathSensitiveBugReport { @@ -53,8 +72,8 @@ protected: public: RefCountReport(const RefCountBug &D, const LangOptions &LOpts, - ExplodedNode *n, SymbolRef sym, - bool isLeak=false); + ExplodedNode *n, SymbolRef sym, bool isLeak = false, + bool IsReleaseUnowned = false); RefCountReport(const RefCountBug &D, const LangOptions &LOpts, ExplodedNode *n, SymbolRef sym, diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 7aecf23b..08caca0 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -330,3 +330,33 @@ namespace ReadMutableInCopyCtor { // both-note {{read of mutable member 'u'}} \ // both-note {{in call to 'G(g1)'}} } + +namespace GH150709 { + struct C { }; + struct D : C { + constexpr int f() const { return 1; }; + }; + struct E : C { }; + struct F : D { }; + struct G : E { }; + + constexpr C c1, c2[2]; + constexpr D d1, d2[2]; + constexpr E e1, e2[2]; + constexpr F f; + constexpr G g; + + constexpr auto mp = static_cast<int (C::*)() const>(&D::f); + + // sanity checks for fix of GH150709 (unchanged behavior) + static_assert((c1.*mp)() == 1, ""); // both-error {{constant expression}} + static_assert((d1.*mp)() == 1, ""); + static_assert((f.*mp)() == 1, ""); + static_assert((c2[0].*mp)() == 1, ""); // ref-error {{constant expression}} + static_assert((d2[0].*mp)() == 1, ""); + + // incorrectly undiagnosed before fix of GH150709 + static_assert((e1.*mp)() == 1, ""); // ref-error {{constant expression}} + static_assert((e2[0].*mp)() == 1, ""); // ref-error {{constant expression}} + static_assert((g.*mp)() == 1, ""); // ref-error {{constant expression}} +} diff --git a/clang/test/AST/ByteCode/cxx2a.cpp b/clang/test/AST/ByteCode/cxx2a.cpp index ac2f988..744c99e 100644 --- a/clang/test/AST/ByteCode/cxx2a.cpp +++ b/clang/test/AST/ByteCode/cxx2a.cpp @@ -225,3 +225,17 @@ namespace Dtor { static_assert(pseudo(true, false)); // both-error {{constant expression}} both-note {{in call}} static_assert(pseudo(false, true)); } + +namespace GH150705 { + struct A { }; + struct B : A { }; + struct C : A { + constexpr virtual int foo() const { return 0; } + }; + + constexpr auto p = &C::foo; + constexpr auto q = static_cast<int (A::*)() const>(p); + constexpr B b; + constexpr const A& a = b; + constexpr auto x = (a.*q)(); // both-error {{constant expression}} +} diff --git a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp index 35a8aa6..8245025 100644 --- a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp +++ b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp @@ -286,3 +286,211 @@ void foo4() { // CXX_OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1 // CXX_OGCG: store i32 %[[B_REAL]], ptr %[[C_REAL_PTR]], align 4 // CXX_OGCG: store i32 %[[B_IMAG]], ptr %[[C_IMAG_PTR]], align 4 + +void foo5() { + float _Complex a; + float b; + a += b; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.float, 
!cir.ptr<!cir.float>, ["b"] +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr<!cir.float>, !cir.float +// CIR: %[[CONST_ZERO:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.float +// CIR: %[[COMPLEX_B:.*]] = cir.complex.create %[[TMP_B]], %[[CONST_ZERO]] : !cir.float -> !cir.complex<!cir.float> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR: %[[RESULT:.*]] = cir.complex.add %[[TMP_A]], %[[COMPLEX_B]] : !cir.complex<!cir.float> +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca float, i64 1, align 4 +// LLVM: %[[TMP_B:.*]] = load float, ptr %[[B_ADDR]], align 4 +// LLVM: %[[TMP_COMPLEX_B:.*]] = insertvalue { float, float } {{.*}}, float %[[TMP_B]], 0 +// LLVM: %[[COMPLEX_B:.*]] = insertvalue { float, float } %[[TMP_COMPLEX_B]], float 0.000000e+00, 1 +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1 +// LLVM: %[[B_REAL:.*]] = extractvalue { float, float } %[[COMPLEX_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { float, float } %[[COMPLEX_B]], 1 +// LLVM: %[[ADD_REAL:.*]] = fadd float %[[A_REAL]], %[[B_REAL]] +// LLVM: %[[ADD_IMAG:.*]] = fadd float %[[A_IMAG]], %[[B_IMAG]] +// LLVM: %[[TMP_RESULT:.*]] = insertvalue { float, float } poison, float %[[ADD_REAL]], 0 +// LLVM: %[[RESULT:.*]] = insertvalue { float, float } %[[TMP_RESULT]], float %[[ADD_IMAG]], 1 +// LLVM: store { float, float } %[[RESULT]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca float, align 4 +// OGCG: %[[TMP_B:.*]] = load float, ptr %[[B_ADDR]], align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[ADD_REAL:.*]] = fadd float %[[A_REAL]], %[[TMP_B]] +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[ADD_REAL]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void foo6() { + int _Complex a; + int _Complex b; + b *= a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["b"] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i> +// CIR: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex<!s32i> -> !s32i +// CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex<!s32i> -> !s32i +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!s32i> -> !s32i +// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!s32i> -> !s32i +// CIR: %[[MUL_BR_AR:.*]] = cir.binop(mul, 
%[[B_REAL]], %[[A_REAL]]) : !s32i +// CIR: %[[MUL_BI_AI:.*]] = cir.binop(mul, %[[B_IMAG]], %[[A_IMAG]]) : !s32i +// CIR: %[[MUL_BR_AI:.*]] = cir.binop(mul, %[[B_REAL]], %[[A_IMAG]]) : !s32i +// CIR: %[[MUL_BI_AR:.*]] = cir.binop(mul, %[[B_IMAG]], %[[A_REAL]]) : !s32i +// CIR: %[[RESULT_REAL:.*]] = cir.binop(sub, %[[MUL_BR_AR]], %[[MUL_BI_AI]]) : !s32i +// CIR: %[[RESULT_IMAG:.*]] = cir.binop(add, %[[MUL_BR_AI]], %[[MUL_BI_AR]]) : !s32i +// CIR: %[[RESULT:.*]] = cir.complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : !s32i -> !cir.complex<!s32i> +// CIR: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>> + +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[B_ADDR]], align 4 +// LLVM: %[[B_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 1 +// LLVM: %[[A_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 1 +// LLVM: %[[MUL_BR_AR:.*]] = mul i32 %[[B_REAL]], %[[A_REAL]] +// LLVM: %[[MUL_BI_AI:.*]] = mul i32 %[[B_IMAG]], %[[A_IMAG]] +// LLVM: %[[MUL_BR_AI:.*]] = mul i32 %[[B_REAL]], %[[A_IMAG]] +// LLVM: %[[MUL_BI_AR:.*]] = mul i32 %[[B_IMAG]], %[[A_REAL]] +// LLVM: %[[RESULT_REAL:.*]] = sub i32 %[[MUL_BR_AR]], %[[MUL_BI_AI]] +// LLVM: %[[RESULT_IMAG:.*]] = add i32 %[[MUL_BR_AI]], %[[MUL_BI_AR]] +// LLVM: %[[MUL_A_B:.*]] = insertvalue { i32, i32 } {{.*}}, i32 %[[RESULT_REAL]], 0 +// LLVM: %[[RESULT:.*]] = insertvalue { i32, i32 } %[[MUL_A_B]], i32 %[[RESULT_IMAG]], 1 +// LLVM: store { i32, i32 } %[[RESULT]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load i32, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load i32, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: %[[MUL_BR_AR:.*]] = mul i32 %[[B_REAL]], %[[A_REAL]] +// OGCG: %[[MUL_BI_AI:.*]] = mul i32 %[[B_IMAG]], %[[A_IMAG]] +// OGCG: %[[RESULT_REAL:.*]] = sub i32 %[[MUL_BR_AR]], %[[MUL_BI_AI]] +// OGCG: %[[MUL_BI_AR:.*]] = mul i32 %[[B_IMAG]], %[[A_REAL]] +// OGCG: %[[MUL_BR_AI:.*]] = mul i32 %[[B_REAL]], %[[A_IMAG]] +// OGCG: %[[RESULT_IMAG:.*]] = add i32 %[[MUL_BI_AR]], %[[MUL_BR_AI]] +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store i32 %[[RESULT_REAL]], ptr %[[B_REAL_PTR]], align 4 +// OGCG: store i32 %[[RESULT_IMAG]], ptr %[[B_IMAG_PTR]], align 4 + +void foo7() { + float _Complex a; + float _Complex b; + b *= a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR: %[[B_ADDR:.*]] = 
cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b"] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[MUL_BR_AR:.*]] = cir.binop(mul, %[[B_REAL]], %[[A_REAL]]) : !cir.float +// CIR: %[[MUL_BI_AI:.*]] = cir.binop(mul, %[[B_IMAG]], %[[A_IMAG]]) : !cir.float +// CIR: %[[MUL_BR_AI:.*]] = cir.binop(mul, %[[B_REAL]], %[[A_IMAG]]) : !cir.float +// CIR: %[[MUL_BI_AR:.*]] = cir.binop(mul, %[[B_IMAG]], %[[A_REAL]]) : !cir.float +// CIR: %[[C_REAL:.*]] = cir.binop(sub, %[[MUL_BR_AR]], %[[MUL_BI_AI]]) : !cir.float +// CIR: %[[C_IMAG:.*]] = cir.binop(add, %[[MUL_BR_AI]], %[[MUL_BI_AR]]) : !cir.float +// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[C_REAL]], %[[C_IMAG]] : !cir.float -> !cir.complex<!cir.float> +// CIR: %[[IS_C_REAL_NAN:.*]] = cir.cmp(ne, %[[C_REAL]], %[[C_REAL]]) : !cir.float, !cir.bool +// CIR: %[[IS_C_IMAG_NAN:.*]] = cir.cmp(ne, %[[C_IMAG]], %[[C_IMAG]]) : !cir.float, !cir.bool +// CIR: %[[CONST_FALSE:.*]] = cir.const #false +// CIR: %[[SELECT_CONDITION:.*]] = cir.select if %[[IS_C_REAL_NAN]] then %[[IS_C_IMAG_NAN]] else %[[CONST_FALSE]] : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool +// CIR: %[[RESULT:.*]] = cir.ternary(%[[SELECT_CONDITION]], true { +// CIR: %[[LIBC_COMPLEX:.*]] = cir.call @__mulsc3(%[[B_REAL]], %[[B_IMAG]], %[[A_REAL]], %[[A_IMAG]]) : (!cir.float, !cir.float, !cir.float, !cir.float) -> !cir.complex<!cir.float> +// CIR: cir.yield %[[LIBC_COMPLEX]] : !cir.complex<!cir.float> +// CIR: }, false { +// CIR: cir.yield %[[COMPLEX]] : !cir.complex<!cir.float> +// CIR: }) : (!cir.bool) -> !cir.complex<!cir.float> +// CIR: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[TMP_B:.*]] = load { float, float }, ptr %[[B_ADDR]], align 4 +// LLVM: %[[B_REAL:.*]] = extractvalue { float, float } %[[TMP_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { float, float } %[[TMP_B]], 1 +// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1 +// LLVM: %[[MUL_BR_AR:.*]] = fmul float %[[B_REAL]], %[[A_REAL]] +// LLVM: %[[MUL_BI_AI:.*]] = fmul float %[[B_IMAG]], %[[A_IMAG]] +// LLVM: %[[MUL_BR_AI:.*]] = fmul float %[[B_REAL]], %[[A_IMAG]] +// LLVM: %[[MUL_BI_AR:.*]] = fmul float %[[B_IMAG]], %[[A_REAL]] +// LLVM: %[[C_REAL:.*]] = fsub float %[[MUL_BR_AR]], %[[MUL_BI_AI]] +// LLVM: %[[C_IMAG:.*]] = fadd float %[[MUL_BR_AI]], %[[MUL_BI_AR]] +// LLVM: %[[MUL_A_B:.*]] = insertvalue { float, float } {{.*}}, float %[[C_REAL]], 0 +// LLVM: %[[COMPLEX:.*]] = insertvalue { float, float } %[[MUL_A_B]], float %[[C_IMAG]], 1 +// LLVM: %[[IS_C_REAL_NAN:.*]] = fcmp une float %[[C_REAL]], %[[C_REAL]] +// LLVM: %[[IS_C_IMAG_NAN:.*]] = fcmp une float %[[C_IMAG]], 
%[[C_IMAG]] +// LLVM: %[[SELECT_CONDITION:.*]] = and i1 %[[IS_C_REAL_NAN]], %[[IS_C_IMAG_NAN]] +// LLVM: br i1 %[[SELECT_CONDITION]], label %[[THEN_LABEL:.*]], label %[[ELSE_LABEL:.*]] +// LLVM: [[THEN_LABEL]]: +// LLVM: %[[LIBC_COMPLEX:.*]] = call { float, float } @__mulsc3(float %[[B_REAL]], float %[[B_IMAG]], float %[[A_REAL]], float %[[A_IMAG]]) +// LLVM: br label %[[PHI_BRANCH:.*]] +// LLVM: [[ELSE_LABEL]]: +// LLVM: br label %[[PHI_BRANCH:]] +// LLVM: [[PHI_BRANCH:]]: +// LLVM: %[[RESULT:.*]] = phi { float, float } [ %[[COMPLEX]], %[[ELSE_LABEL]] ], [ %[[LIBC_COMPLEX]], %[[THEN_LABEL]] ] +// LLVM: br label %[[END_LABEL:.*]] +// LLVM: [[END_LABEL]]: +// LLVM: store { float, float } %[[RESULT]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[COMPLEX_CALL_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load float, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load float, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: %[[MUL_BR_AR:.*]] = fmul float %[[B_REAL]], %[[A_REAL]] +// OGCG: %[[MUL_BI_AI:.*]] = fmul float %[[B_IMAG]], %[[A_IMAG]] +// OGCG: %[[MUL_BR_AI:.*]] = fmul float %[[B_REAL]], %[[A_IMAG]] +// OGCG: %[[MUL_BI_AR:.*]] = fmul float %[[B_IMAG]], %[[A_REAL]] +// OGCG: %[[C_REAL:.*]] = fsub float %[[MUL_BR_AR]], %[[MUL_BI_AI]] +// OGCG: %[[C_IMAG:.*]] = fadd float %[[MUL_BR_AI]], %[[MUL_BI_AR]] +// OGCG: %[[IS_C_REAL_NAN:.*]] = fcmp uno float %[[C_REAL]], %[[C_REAL]] +// OGCG: br i1 %[[IS_C_REAL_NAN]], label %[[COMPLEX_IS_IMAG_NAN:.*]], label %[[END_LABEL:.*]], !prof !2 +// OGCG: [[COMPLEX_IS_IMAG_NAN]]: +// OGCG: %[[IS_C_IMAG_NAN:.*]] = fcmp uno float %[[C_IMAG]], %[[C_IMAG]] +// OGCG: br i1 %[[IS_C_IMAG_NAN]], label %[[COMPLEX_LIB_CALL:.*]], label %[[END_LABEL]], !prof !2 +// OGCG: [[COMPLEX_LIB_CALL]]: +// OGCG: %[[CALL_RESULT:.*]] = call{{.*}} <2 x float> @__mulsc3(float noundef %[[B_REAL]], float noundef %[[B_IMAG]], float noundef %[[A_REAL]], float noundef %[[A_IMAG]]) +// OGCG: store <2 x float> %[[CALL_RESULT]], ptr %[[COMPLEX_CALL_ADDR]], align 4 +// OGCG: %[[COMPLEX_CALL_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_CALL_ADDR]], i32 0, i32 0 +// OGCG: %[[COMPLEX_CALL_REAL:.*]] = load float, ptr %[[COMPLEX_CALL_REAL_PTR]], align 4 +// OGCG: %[[COMPLEX_CALL_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_CALL_ADDR]], i32 0, i32 1 +// OGCG: %[[COMPLEX_CALL_IMAG:.*]] = load float, ptr %[[COMPLEX_CALL_IMAG_PTR]], align 4 +// OGCG: br label %[[END_LABEL]] +// OGCG: [[END_LABEL]]: +// OGCG: %[[FINAL_REAL:.*]] = phi float [ %[[C_REAL]], %[[ENTRY:.*]] ], [ %[[C_REAL]], %[[COMPLEX_IS_IMAG_NAN]] ], [ %[[COMPLEX_CALL_REAL]], %[[COMPLEX_LIB_CALL]] ] +// OGCG: %[[FINAL_IMAG:.*]] = phi float [ %[[C_IMAG]], %[[ENTRY]] ], [ %[[C_IMAG]], %[[COMPLEX_IS_IMAG_NAN]] ], [ %[[COMPLEX_CALL_IMAG]], %[[COMPLEX_LIB_CALL]] ] +// OGCG: %[[C_REAL_PTR:.*]] = 
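(Aside: the foo7 branches above implement Annex G-style float _Complex multiplication. The naive product is kept unless both of its components are NaN; only then does the code call the compiler-rt helper __mulsc3, which redoes the multiply with full infinity handling. A source-level sketch of that logic; the wrapper name is hypothetical, __mulsc3 is the real runtime entry point:)

float _Complex __mulsc3(float a, float b, float c, float d); // compiler-rt: (a+bi)*(c+di)

static float _Complex mul_float_complex(float _Complex b, float _Complex a) {
  float br = __real__ b, bi = __imag__ b;
  float ar = __real__ a, ai = __imag__ a;
  float re = br * ar - bi * ai;
  float im = br * ai + bi * ar;
  if (re != re && im != im)           // both parts NaN (the fcmp une above): fall back
    return __mulsc3(br, bi, ar, ai);
  return __builtin_complex(re, im);
}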
getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[FINAL_REAL]], ptr %[[C_REAL_PTR]], align 4 +// OGCG: store float %[[FINAL_IMAG]], ptr %[[C_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/module-asm.c b/clang/test/CIR/CodeGen/module-asm.c new file mode 100644 index 0000000..e6cec5e --- /dev/null +++ b/clang/test/CIR/CodeGen/module-asm.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +// CHECK: cir.module_asm = [".globl bar", ".globl foo"] +__asm (".globl bar"); +__asm (".globl foo"); diff --git a/clang/test/CIR/IR/invalid-vtable.cir b/clang/test/CIR/IR/invalid-vtable.cir new file mode 100644 index 0000000..b3afb581 --- /dev/null +++ b/clang/test/CIR/IR/invalid-vtable.cir @@ -0,0 +1,9 @@ +// RUN: cir-opt %s -verify-diagnostics + +!s8i = !cir.int<s, 8> +!u32i = !cir.int<u, 32> +cir.func @reference_unknown_vtable() { + // expected-error @below {{'cir.vtable.address_point' op 'some_vtable' does not reference a valid cir.global}} + %0 = cir.vtable.address_point(@some_vtable, address_point = <index = 0, offset = 2>) : !cir.vptr + cir.return +} diff --git a/clang/test/CIR/IR/vtable-addrpt.cir b/clang/test/CIR/IR/vtable-addrpt.cir new file mode 100644 index 0000000..0b809cc --- /dev/null +++ b/clang/test/CIR/IR/vtable-addrpt.cir @@ -0,0 +1,23 @@ +// RUN: cir-opt %s | FileCheck %s + +// Test the parsing and printing of a constructor that uses a vtable address_point op. + +!u32i = !cir.int<u, 32> +!u8i = !cir.int<u, 8> +!rec_anon_struct = !cir.record<struct {!cir.array<!cir.ptr<!u8i> x 4>}> +!rec_S = !cir.record<struct "S" {!cir.vptr}> + +module { + cir.global "private" external @_ZTV1S : !rec_anon_struct {alignment = 8 : i64} + cir.func @_ZN1SC2Ev(%arg0: !cir.ptr<!rec_S>) { + %0 = cir.alloca !cir.ptr<!rec_S>, !cir.ptr<!cir.ptr<!rec_S>>, ["this", init] {alignment = 8 : i64} + cir.store %arg0, %0 : !cir.ptr<!rec_S>, !cir.ptr<!cir.ptr<!rec_S>> + %1 = cir.load %0 : !cir.ptr<!cir.ptr<!rec_S>>, !cir.ptr<!rec_S> + %2 = cir.vtable.address_point(@_ZTV1S, address_point = <index = 0, offset = 2>) : !cir.vptr + %3 = cir.cast(bitcast, %1 : !cir.ptr<!rec_S>), !cir.ptr<!cir.vptr> + cir.store align(8) %2, %3 : !cir.vptr, !cir.ptr<!cir.vptr> + cir.return + } +} + +// CHECK: cir.vtable.address_point(@_ZTV1S, address_point = <index = 0, offset = 2>) : !cir.vptr diff --git a/clang/test/CIR/Lowering/module-asm.cir b/clang/test/CIR/Lowering/module-asm.cir new file mode 100644 index 0000000..b802cda --- /dev/null +++ b/clang/test/CIR/Lowering/module-asm.cir @@ -0,0 +1,11 @@ +// RUN: cir-opt %s -cir-to-llvm -o %t.cir +// RUN: FileCheck %s --input-file=%t.cir + +// RUN: cir-translate -cir-to-llvmir --disable-cc-lowering -o %t.ll %s +// RUN: FileCheck -check-prefix=LLVM --input-file=%t.ll %s + +// CHECK: llvm.module_asm = [".globl bar", ".globl foo"] +// LLVM: module asm ".globl bar" +// LLVM: module asm ".globl foo" +module attributes {cir.module_asm = [".globl bar", ".globl foo"]} { +} diff --git a/clang/test/CodeGen/WebAssembly/builtins-test-fp-sig.c b/clang/test/CodeGen/WebAssembly/builtins-test-fp-sig.c new file mode 100644 index 0000000..88447f7 --- /dev/null +++ b/clang/test/CodeGen/WebAssembly/builtins-test-fp-sig.c @@ -0,0 +1,70 @@ +// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +gc -O3 -emit-llvm -DSINGLE_VALUE -o - %s | FileCheck %s
-check-prefixes WEBASSEMBLY,WEBASSEMBLY-SV +// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +gc -O3 -emit-llvm -DSINGLE_VALUE -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY-SV +// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +gc -target-abi experimental-mv -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes WEBASSEMBLY +// RUN: not %clang_cc1 -triple wasm64-unknown-unknown -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes MISSING-GC + +void use(int); + +typedef void (*Fvoid)(void); +void test_function_pointer_signature_void(Fvoid func) { + // MISSING-GC: error: '__builtin_wasm_test_function_pointer_signature' needs target feature gc + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +typedef float (*Ffloats)(float, double, int); +void test_function_pointer_signature_floats(Ffloats func) { + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, float poison, token poison, float poison, double poison, i32 poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +typedef void (*Fpointers)(Fvoid, Ffloats, void*, int*, int***, char[5]); +void test_function_pointer_signature_pointers(Fpointers func) { + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, ptr poison, ptr poison, ptr poison, ptr poison, ptr poison, ptr poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +typedef void (*FVarArgs)(int, ...); +void test_function_pointer_signature_varargs(FVarArgs func) { + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, i32 poison, ptr poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +typedef __externref_t (*FExternRef)(__externref_t, __externref_t); +void test_function_pointer_externref(FExternRef func) { + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr addrspace(10) poison, token poison, ptr addrspace(10) poison, ptr addrspace(10) poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +typedef __funcref Fpointers (*FFuncRef)(__funcref Fvoid, __funcref Ffloats); +void test_function_pointer_funcref(FFuncRef func) { + // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr addrspace(20) poison, token poison, ptr addrspace(20) poison, ptr addrspace(20) poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +#ifdef SINGLE_VALUE +// Some tests checking that we get struct ABIs correct. There is no special code in +// __builtin_wasm_test_function_pointer_signature for this; it gets handled by +// the normal type lowering code. +// Single-element structs are unboxed; multi-element structs are passed on +// the stack. +typedef struct {double x;} (*Fstructs1)(struct {double x;}, struct {float x;}, struct {double x; float y;}); +void test_function_pointer_structs1(Fstructs1 func) { + // WEBASSEMBLY-SV: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, double poison, token poison, double poison, float poison, ptr poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +// Two-element return struct ==> return ptr on stack +typedef struct {double x; double y;} (*Fstructs2)(void); +void test_function_pointer_structs2(Fstructs2 func) { + // WEBASSEMBLY-SV: %0 = tail call i32 (ptr, ...)
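(Aside: the tests above only feed the builtin's result to use(); the intended pattern is to guard an indirect call so a signature mismatch can be handled gracefully instead of trapping inside call_indirect. A hedged sketch; the Callback type and error value are hypothetical, the builtin is the one under test:)

typedef int (*Callback)(int);

static int invoke_checked(Callback cb, int arg) {
  // Nonzero iff the function the pointer refers to really has Callback's signature.
  if (__builtin_wasm_test_function_pointer_signature(cb))
    return cb(arg);
  return -1; // signature mismatch: report it rather than trap
}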
@llvm.wasm.ref.test.func(ptr %func, token poison, ptr poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} + +// Return union ==> return ptr on stack, one element union => unboxed +typedef union {double x; float y;} (*FUnions)(union {double x; float y;}, union {double x;}); +void test_function_pointer_unions(FUnions func) { + // WEBASSEMBLY-SV: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, ptr poison, ptr poison, double poison) + use(__builtin_wasm_test_function_pointer_signature(func)); +} +#endif diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 2d43764..e2c9f96 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -20,12 +20,14 @@ __m256d test_mm256_add_pd(__m256d A, __m256d B) { // CHECK: fadd <4 x double> return _mm256_add_pd(A, B); } +TEST_CONSTEXPR(match_m256d( _mm256_add_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){-4.0, -5.0, +6.0, +7.0}), -8.0, -10.0, +12.0, +14.0)); __m256 test_mm256_add_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_add_ps // CHECK: fadd <8 x float> return _mm256_add_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_add_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}), -8.0f, -10.0f, +12.0f, +14.0f, +14.0f, +12.0f, -10.0f, -8.0f)); __m256d test_mm256_addsub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_addsub_pd @@ -977,12 +979,14 @@ __m256d test_mm256_div_pd(__m256d A, __m256d B) { // CHECK: fdiv <4 x double> return _mm256_div_pd(A, B); } +TEST_CONSTEXPR(match_m256d( _mm256_div_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){-1.0, +1.0, -1.0, +1.0}), +4.0, -5.0, -6.0, +7.0)); __m256 test_mm256_div_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_div_ps // CHECK: fdiv <8 x float> return _mm256_div_ps(A, B); } +TEST_CONSTEXPR(match_m256( _mm256_div_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){-1.0f, +1.0f, -1.0f, +1.0f, +1.0f, -1.0f, +1.0f, -1.0f}), +4.0f, -5.0f, -6.0f, +7.0f, +7.0f, -6.0f, -5.0f, +4.0f)); __m256 test_mm256_dp_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_dp_ps @@ -1295,12 +1299,14 @@ __m256d test_mm256_mul_pd(__m256d A, __m256d B) { // CHECK: fmul <4 x double> return _mm256_mul_pd(A, B); } +TEST_CONSTEXPR(match_m256d( _mm256_mul_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){-4.0, -5.0, +6.0, +7.0}), +16.0, +25.0, +36.0, +49.0)); __m256 test_mm256_mul_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_mul_ps // CHECK: fmul <8 x float> return _mm256_mul_ps(A, B); } +TEST_CONSTEXPR(match_m256( _mm256_mul_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}), +16.0f, +25.0f, +36.0f, +49.0f, +49.0f, +36.0f, +25.0f, +16.0f)); __m256d test_mm256_or_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_or_pd @@ -1933,12 +1939,14 @@ __m256d test_mm256_sub_pd(__m256d A, __m256d B) { // CHECK: fsub <4 x double> return _mm256_sub_pd(A, B); } +TEST_CONSTEXPR(match_m256d( _mm256_sub_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){-0.0, +0.0, +2.0, -1.0}), -4.0, -5.0, 4.0, 8.0)); __m256 test_mm256_sub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_sub_ps // CHECK: fsub <8 x float> return _mm256_sub_ps(A, B); } +TEST_CONSTEXPR(match_m256( _mm256_sub_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){-0.0f, +0.0f, +2.0f, -1.0f, -1.0f, +2.0f, +0.0f, -0.0f}), -4.0f, 
-5.0f, 4.0f, 8.0f, 8.0f, 4.0f, -5.0f, -4.0f)); int test_mm_testc_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_testc_pd @@ -2062,24 +2070,28 @@ __m256d test_mm256_unpackhi_pd(__m256d A, __m256d B) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7> return _mm256_unpackhi_pd(A, B); } +TEST_CONSTEXPR(match_m256d(_mm256_unpackhi_pd((__m256d){+1.0, +2.0, +3.0, +4.0}, (__m256d){+5.0, +6.0, +7.0, +8.0}), +2.0, +6.0, +4.0, +8.0)); __m256 test_mm256_unpackhi_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_unpackhi_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> return _mm256_unpackhi_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_unpackhi_ps((__m256){+0.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f}, (__m256){+10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f, +17.0f}), +2.0f, +12.0f, +3.0f, +13.0f, +6.0f, +16.0f, +7.0f, +17.0f)); __m256d test_mm256_unpacklo_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_unpacklo_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> return _mm256_unpacklo_pd(A, B); } +TEST_CONSTEXPR(match_m256d(_mm256_unpacklo_pd((__m256d){+1.0, +2.0, +3.0, +4.0}, (__m256d){+5.0, +6.0, +7.0, +8.0}), +1.0, +5.0, +3.0, +7.0)); __m256 test_mm256_unpacklo_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_unpacklo_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> return _mm256_unpacklo_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_unpacklo_ps((__m256){+0.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f}, (__m256){+10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f, +17.0f}), +0.0f, +10.0f, +1.0f, +11.0f, +4.0f, +14.0f, +5.0f, +15.0f)); __m256d test_mm256_xor_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_xor_pd diff --git a/clang/test/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c index 2ceac3a..4069b46 100644 --- a/clang/test/CodeGen/X86/avx512-reduceIntrin.c +++ b/clang/test/CodeGen/X86/avx512-reduceIntrin.c @@ -1,162 +1,174 @@ -// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=i386-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=i386-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> +#include "builtin_test_helpers.h" long long test_mm512_reduce_add_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_add_epi64( -// CHECK: call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_add_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_add_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_add_epi64((__m512i)(__v8di){-4, -3, -2, -1, 0, 1, 2, 3}) == -4); long long test_mm512_reduce_mul_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_mul_epi64( -// CHECK: 
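(Aside: TEST_CONSTEXPR comes from builtin_test_helpers.h, which these files now include. As a rough sketch of its behavior, inferred from how the tests use it rather than copied from the header: it becomes a static_assert in C++ modes, so the expression is checked by constant evaluation, and it compiles away in C modes; that is why the new -x c++ RUN lines matter:)

#if defined(__cplusplus)
#define TEST_CONSTEXPR(...) static_assert((__VA_ARGS__), "")
#else
#define TEST_CONSTEXPR(...) /* no constant-expression checking in C */
#endif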
call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_mul_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_mul_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_mul_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}) == 40320); long long test_mm512_reduce_or_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_or_epi64( -// CHECK: call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_or_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_or_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_or_epi64((__m512i)(__v8di){0x100, 0x200, 0x400, 0x800, 0, 0, 0, 0}) == 0xF00); long long test_mm512_reduce_and_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_and_epi64( -// CHECK: call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_and_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_and_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_and_epi64((__m512i)(__v8di){0xFFFF, 0xFF00, 0x00FF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFF00, 0x00FF}) == 0x0000); long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_add_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_add_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_add_epi64(__M, __W); } long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_mul_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_mul_epi64(__M, __W); } long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_and_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_and_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_and_epi64(__M, __W); } long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_or_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_or_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_or_epi64(__M, __W); } int test_mm512_reduce_add_epi32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_add_epi32( -// CHECK: call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_add_epi32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_add_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_add_epi32((__m512i)(__v16si){-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}) == -8); int test_mm512_reduce_mul_epi32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_mul_epi32( -// CHECK: call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> 
%{{.*}}) +// CHECK-LABEL: test_mm512_reduce_mul_epi32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_mul_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_mul_epi32((__m512i)(__v16si){1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, -3, 1, 1}) == -36); int test_mm512_reduce_or_epi32(__m512i __W){ -// CHECK: call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_or_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_or_epi32((__m512i)(__v16si){0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0}) == 0xFF); int test_mm512_reduce_and_epi32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_and_epi32( -// CHECK: call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_and_epi32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_and_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_and_epi32((__m512i)(__v16si){0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xF0, 0xF0, 0x0F, 0x0F}) == 0x00); int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_add_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_add_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_add_epi32(__M, __W); } int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_mul_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_mul_epi32(__M, __W); } int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_and_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_and_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_and_epi32(__M, __W); } int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_or_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_or_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_or_epi32(__M, __W); } double test_mm512_reduce_add_pd(__m512d __W, double ExtraAddOp){ -// CHECK-LABEL: @test_mm512_reduce_add_pd( +// CHECK-LABEL: test_mm512_reduce_add_pd // CHECK-NOT: reassoc -// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}}) +// CHECK: call reassoc {{.*}}double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}}) // CHECK-NOT: reassoc return _mm512_reduce_add_pd(__W) + ExtraAddOp; } double test_mm512_reduce_mul_pd(__m512d __W, double ExtraMulOp){ -// CHECK-LABEL: @test_mm512_reduce_mul_pd( +// CHECK-LABEL: test_mm512_reduce_mul_pd // CHECK-NOT: reassoc -// CHECK: call 
reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) +// CHECK: call reassoc {{.*}}double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) // CHECK-NOT: reassoc return _mm512_reduce_mul_pd(__W) * ExtraMulOp; } float test_mm512_reduce_add_ps(__m512 __W){ -// CHECK-LABEL: @test_mm512_reduce_add_ps( -// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_add_ps +// CHECK: call reassoc {{.*}}float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}}) return _mm512_reduce_add_ps(__W); } float test_mm512_reduce_mul_ps(__m512 __W){ -// CHECK-LABEL: @test_mm512_reduce_mul_ps( -// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_mul_ps +// CHECK: call reassoc {{.*}}float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) return _mm512_reduce_mul_ps(__W); } double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_add_pd( +// CHECK-LABEL: test_mm512_mask_reduce_add_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}}) +// CHECK: call reassoc {{.*}}double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}}) return _mm512_mask_reduce_add_pd(__M, __W); } double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_mul_pd( +// CHECK-LABEL: test_mm512_mask_reduce_mul_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) +// CHECK: call reassoc {{.*}}double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) return _mm512_mask_reduce_mul_pd(__M, __W); } float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_add_ps( +// CHECK-LABEL: test_mm512_mask_reduce_add_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}} -// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}}) +// CHECK: call reassoc {{.*}}float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}}) return _mm512_mask_reduce_add_ps(__M, __W); } float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_mul_ps( +// CHECK-LABEL: test_mm512_mask_reduce_mul_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}} -// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) +// CHECK: call reassoc {{.*}}float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) return _mm512_mask_reduce_mul_ps(__M, __W); } diff --git a/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c index 3e33ec5..0110079 100644 --- a/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c +++ b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c @@ -1,164 +1,175 @@ -// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu 
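(Aside: note how the CHECK-NOT: reassoc lines bracket each reduction so the fast-math flag is pinned to the llvm.vector.reduce call alone; the user's trailing + ExtraAddOp / * ExtraMulOp must stay strict. Without reassoc, a floating-point fadd reduction is an ordered chain seeded by the start value:)

// reduce_add_pd(v) ~ (((((((( -0.0 + v[0]) + v[1]) + v[2]) + v[3])
//                         + v[4]) + v[5]) + v[6]) + v[7])
// 'reassoc' licenses the backend to regroup this chain, e.g. pairwise.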
skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=i386-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=i386-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> +#include "builtin_test_helpers.h" long long test_mm512_reduce_max_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_max_epi64( -// CHECK: call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_max_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_max_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_max_epi64((__m512i)(__v8di){-4, -3, -2, -1, 0, 1, 2, 3}) == 3); unsigned long long test_mm512_reduce_max_epu64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_max_epu64( -// CHECK: call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_max_epu64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_max_epu64(__W); } +TEST_CONSTEXPR(_mm512_reduce_max_epu64((__m512i)(__v8du){0, 1, 2, 3, 4, 5, 6, 7}) == 7); double test_mm512_reduce_max_pd(__m512d __W, double ExtraAddOp){ -// CHECK-LABEL: @test_mm512_reduce_max_pd( +// CHECK-LABEL: test_mm512_reduce_max_pd // CHECK-NOT: nnan -// CHECK: call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) +// CHECK: call nnan {{.*}}double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) // CHECK-NOT: nnan return _mm512_reduce_max_pd(__W) + ExtraAddOp; } long long test_mm512_reduce_min_epi64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_min_epi64( -// CHECK: call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_min_epi64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_min_epi64(__W); } +TEST_CONSTEXPR(_mm512_reduce_min_epi64((__m512i)(__v8di){-4, -3, -2, -1, 0, 1, 2, 3}) == -4); unsigned long long test_mm512_reduce_min_epu64(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_min_epu64( -// CHECK: call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_min_epu64 +// CHECK: call {{.*}}i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}}) return _mm512_reduce_min_epu64(__W); } +TEST_CONSTEXPR(_mm512_reduce_min_epu64((__m512i)(__v8du){0, 1, 2, 3, 4, 5, 6, 7}) == 0); double test_mm512_reduce_min_pd(__m512d __W, double ExtraMulOp){ -// CHECK-LABEL: @test_mm512_reduce_min_pd( +// CHECK-LABEL: test_mm512_reduce_min_pd // CHECK-NOT: nnan -// CHECK: call nnan double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) +// CHECK: call nnan {{.*}}double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) // CHECK-NOT: nnan return _mm512_reduce_min_pd(__W) * ExtraMulOp; } long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_max_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_max_epi64(__M, __W); } unsigned long 
long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_epu64( +// CHECK-LABEL: test_mm512_mask_reduce_max_epu64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_max_epu64(__M, __W); } double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_pd( +// CHECK-LABEL: test_mm512_mask_reduce_max_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) +// CHECK: call nnan {{.*}}double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) return _mm512_mask_reduce_max_pd(__M, __W); } long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_epi64( +// CHECK-LABEL: test_mm512_mask_reduce_min_epi64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_min_epi64(__M, __W); } unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_epu64( +// CHECK-LABEL: test_mm512_mask_reduce_min_epu64 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} -// CHECK: call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}}) +// CHECK: call {{.*}}i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}}) return _mm512_mask_reduce_min_epu64(__M, __W); } double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_pd( +// CHECK-LABEL: test_mm512_mask_reduce_min_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: call nnan double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) +// CHECK: call nnan {{.*}}double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) return _mm512_mask_reduce_min_pd(__M, __W); } int test_mm512_reduce_max_epi32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_max_epi32( -// CHECK: call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_max_epi32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_max_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_max_epi32((__m512i)(__v16si){-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}) == 7); unsigned int test_mm512_reduce_max_epu32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_max_epu32( -// CHECK: call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_max_epu32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_max_epu32(__W); } +TEST_CONSTEXPR(_mm512_reduce_max_epu32((__m512i)(__v16su){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}) == 15); float test_mm512_reduce_max_ps(__m512 __W){ -// CHECK-LABEL: @test_mm512_reduce_max_ps( -// CHECK: call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_max_ps +// CHECK: call nnan {{.*}}float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) return _mm512_reduce_max_ps(__W); } int test_mm512_reduce_min_epi32(__m512i __W){ -// CHECK-LABEL:
@test_mm512_reduce_min_epi32( -// CHECK: call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_min_epi32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_min_epi32(__W); } +TEST_CONSTEXPR(_mm512_reduce_min_epi32((__m512i)(__v16si){-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}) == -8); unsigned int test_mm512_reduce_min_epu32(__m512i __W){ -// CHECK-LABEL: @test_mm512_reduce_min_epu32( -// CHECK: call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_min_epu32 +// CHECK: call {{.*}}i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}}) return _mm512_reduce_min_epu32(__W); } +TEST_CONSTEXPR(_mm512_reduce_min_epu32((__m512i)(__v16su){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}) == 0); float test_mm512_reduce_min_ps(__m512 __W){ -// CHECK-LABEL: @test_mm512_reduce_min_ps( -// CHECK: call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}}) +// CHECK-LABEL: test_mm512_reduce_min_ps +// CHECK: call nnan {{.*}}float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}}) return _mm512_reduce_min_ps(__W); } int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_max_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_max_epi32(__M, __W); } unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_epu32( +// CHECK-LABEL: test_mm512_mask_reduce_max_epu32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_max_epu32(__M, __W); } float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_max_ps( +// CHECK-LABEL: test_mm512_mask_reduce_max_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} -// CHECK: call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) +// CHECK: call nnan {{.*}}float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) return _mm512_mask_reduce_max_ps(__M, __W); } int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_epi32( +// CHECK-LABEL: test_mm512_mask_reduce_min_epi32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_min_epi32(__M, __W); } unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_epu32( +// CHECK-LABEL: test_mm512_mask_reduce_min_epu32 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} -// CHECK: call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}}) +// CHECK: call {{.*}}i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}}) return _mm512_mask_reduce_min_epu32(__M, __W); } float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){ -// CHECK-LABEL: @test_mm512_mask_reduce_min_ps( +// CHECK-LABEL: 
test_mm512_mask_reduce_min_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} -// CHECK: call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}}) +// CHECK: call nnan {{.*}}float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}}) return _mm512_mask_reduce_min_ps(__M, __W); } - diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 39fb92d..1c01695 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1,18 +1,21 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> #include "builtin_test_helpers.h" __mmask8 test_knot_mask8(__mmask8 a) { - // CHECK-LABEL: @test_knot_mask8 + // CHECK-LABEL: test_knot_mask8 // CHECK: [[IN:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[NOT:%.*]] = xor <8 x i1> [[IN]], splat (i1 true) return _knot_mask8(a); } __mmask8 test_kand_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kand_mask8 + // CHECK-LABEL: test_kand_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = and <8 x i1> [[LHS]], [[RHS]] @@ -22,7 +25,7 @@ __mmask8 test_kand_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m } __mmask8 test_kandn_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kandn_mask8 + // CHECK-LABEL: test_kandn_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[NOT:%.*]] = xor <8 x i1> [[LHS]], splat (i1 true) @@ -33,7 +36,7 @@ __mmask8 test_kandn_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __ } __mmask8 test_kor_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kor_mask8 + // CHECK-LABEL: test_kor_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = or <8 x i1> [[LHS]], [[RHS]] @@ -43,7 +46,7 @@ __mmask8 test_kor_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m5 } __mmask8 test_kxnor_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kxnor_mask8 + // CHECK-LABEL: test_kxnor_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[NOT:%.*]] = xor <8 x i1> [[LHS]], splat (i1 true) @@ -54,7 +57,7 @@ __mmask8 test_kxnor_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __ } __mmask8 test_kxor_mask8(__m512i __A, __m512i 
__B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kxor_mask8 + // CHECK-LABEL: test_kxor_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = xor <8 x i1> [[LHS]], [[RHS]] @@ -64,7 +67,7 @@ __mmask8 test_kxor_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m } unsigned char test_kortestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_kortestz_mask8_u8 + // CHECK-LABEL: test_kortestz_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[OR:%.*]] = or <8 x i1> [[LHS]], [[RHS]] @@ -77,7 +80,7 @@ unsigned char test_kortestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5 } unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_kortestc_mask8_u8 + // CHECK-LABEL: test_kortestc_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[OR:%.*]] = or <8 x i1> [[LHS]], [[RHS]] @@ -90,7 +93,7 @@ unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5 } unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) { - // CHECK-LABEL: @test_kortest_mask8_u8 + // CHECK-LABEL: test_kortest_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[OR:%.*]] = or <8 x i1> [[LHS]], [[RHS]] @@ -110,7 +113,7 @@ unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m51 } unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_ktestz_mask8_u8 + // CHECK-LABEL: test_ktestz_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestz.b(<8 x i1> [[LHS]], <8 x i1> [[RHS]]) @@ -120,7 +123,7 @@ unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512 } unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_ktestc_mask8_u8 + // CHECK-LABEL: test_ktestc_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.b(<8 x i1> [[LHS]], <8 x i1> [[RHS]]) @@ -130,7 +133,7 @@ unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512 } unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) { - // CHECK-LABEL: @test_ktest_mask8_u8 + // CHECK-LABEL: test_ktest_mask8_u8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.b(<8 x i1> [[LHS]], <8 x i1> [[RHS]]) @@ -144,7 +147,7 @@ unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i } unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_ktestz_mask16_u8 + // CHECK-LABEL: test_ktestz_mask16_u8 // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestz.w(<16 x i1> [[LHS]], <16 x i1> [[RHS]]) @@ 
-154,7 +157,7 @@ unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51 } unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) { - // CHECK-LABEL: @test_ktestc_mask16_u8 + // CHECK-LABEL: test_ktestc_mask16_u8 // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.w(<16 x i1> [[LHS]], <16 x i1> [[RHS]]) @@ -164,7 +167,7 @@ unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51 } unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) { - // CHECK-LABEL: @test_ktest_mask16_u8 + // CHECK-LABEL: test_ktest_mask16_u8 // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.w(<16 x i1> [[LHS]], <16 x i1> [[RHS]]) @@ -178,7 +181,7 @@ unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512 } __mmask8 test_kadd_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kadd_mask8 + // CHECK-LABEL: test_kadd_mask8 // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.kadd.b(<8 x i1> [[LHS]], <8 x i1> [[RHS]]) @@ -188,7 +191,7 @@ __mmask8 test_kadd_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m } __mmask16 test_kadd_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { - // CHECK-LABEL: @test_kadd_mask16 + // CHECK-LABEL: test_kadd_mask16 // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> // CHECK: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.kadd.w(<16 x i1> [[LHS]], <16 x i1> [[RHS]]) @@ -198,71 +201,71 @@ __mmask16 test_kadd_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _ } __mmask8 test_kshiftli_mask8(__m512i A, __m512i B, __m512i C, __m512i D) { - // CHECK-LABEL: @test_kshiftli_mask8 + // CHECK-LABEL: test_kshiftli_mask8 // CHECK: [[VAL:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = shufflevector <8 x i1> zeroinitializer, <8 x i1> [[VAL]], <8 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13> return _mm512_mask_cmpneq_epu64_mask(_kshiftli_mask8(_mm512_cmpneq_epu64_mask(A, B), 2), C, D); } __mmask8 test_kshiftri_mask8(__m512i A, __m512i B, __m512i C, __m512i D) { - // CHECK-LABEL: @test_kshiftri_mask8 + // CHECK-LABEL: test_kshiftri_mask8 // CHECK: [[VAL:%.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: [[RES:%.*]] = shufflevector <8 x i1> [[VAL]], <8 x i1> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> return _mm512_mask_cmpneq_epu64_mask(_kshiftri_mask8(_mm512_cmpneq_epu64_mask(A, B), 2), C, D); } unsigned int test_cvtmask8_u32(__m512i A, __m512i B) { - // CHECK-LABEL: @test_cvtmask8_u32 + // CHECK-LABEL: test_cvtmask8_u32 // CHECK: zext i8 %{{.*}} to i32 return _cvtmask8_u32(_mm512_cmpneq_epu64_mask(A, B)); } __mmask8 test_cvtu32_mask8(__m512i A, __m512i B, unsigned int C) { - // CHECK-LABEL: @test_cvtu32_mask8 + // CHECK-LABEL: test_cvtu32_mask8 // CHECK: trunc i32 %{{.*}} to i8 return _mm512_mask_cmpneq_epu64_mask(_cvtu32_mask8(C), A, B); } __mmask8 test_load_mask8(__mmask8 *A, __m512i B, __m512i C) { - // CHECK-LABEL: @test_load_mask8 + // CHECK-LABEL: 
test_load_mask8 // CHECK: [[LOAD:%.*]] = load i8, ptr %{{.*}} return _mm512_mask_cmpneq_epu64_mask(_load_mask8(A), B, C); } void test_store_mask8(__mmask8 *A, __m512i B, __m512i C) { - // CHECK-LABEL: @test_store_mask8 + // CHECK-LABEL: test_store_mask8 // CHECK: store i8 %{{.*}}, ptr %{{.*}} _store_mask8(A, _mm512_cmpneq_epu64_mask(B, C)); } __m512i test_mm512_mullo_epi64 (__m512i __A, __m512i __B) { - // CHECK-LABEL: @test_mm512_mullo_epi64 + // CHECK-LABEL: test_mm512_mullo_epi64 // CHECK: mul <8 x i64> return (__m512i) _mm512_mullo_epi64(__A, __B); } __m512i test_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - // CHECK-LABEL: @test_mm512_mask_mullo_epi64 + // CHECK-LABEL: test_mm512_mask_mullo_epi64 // CHECK: mul <8 x i64> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return (__m512i) _mm512_mask_mullo_epi64(__W, __U, __A, __B); } __m512i test_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - // CHECK-LABEL: @test_mm512_maskz_mullo_epi64 + // CHECK-LABEL: test_mm512_maskz_mullo_epi64 // CHECK: mul <8 x i64> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return (__m512i) _mm512_maskz_mullo_epi64(__U, __A, __B); } __m512d test_mm512_xor_pd (__m512d __A, __m512d __B) { - // CHECK-LABEL: @test_mm512_xor_pd + // CHECK-LABEL: test_mm512_xor_pd // CHECK: xor <8 x i64> return (__m512d) _mm512_xor_pd(__A, __B); } __m512d test_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - // CHECK-LABEL: @test_mm512_mask_xor_pd + // CHECK-LABEL: test_mm512_mask_xor_pd // CHECK: xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} @@ -271,7 +274,7 @@ __m512d test_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d TEST_CONSTEXPR(match_m512d(_mm512_xor_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), -4.0, +5.0, -6.0, +0.0, +0.0, -6.0, +5.0, -4.0)); __m512d test_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { - // CHECK-LABEL: @test_mm512_maskz_xor_pd + // CHECK-LABEL: test_mm512_maskz_xor_pd // CHECK: xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} @@ -279,14 +282,14 @@ __m512d test_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { } __m512 test_mm512_xor_ps (__m512 __A, __m512 __B) { - // CHECK-LABEL: @test_mm512_xor_ps + // CHECK-LABEL: test_mm512_xor_ps // CHECK: xor <16 x i32> return (__m512) _mm512_xor_ps(__A, __B); } -TEST_CONSTEXPR(match_m512(_mm512_xor_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -4.0f, +5.0f, -6.0f, +0.0f, +0.0f, -6.0f, +5.0f, -4.0f, -4.0f, +5.0f, -6.0f, +0.0f, +0.0f, -6.0f, +5.0f, -4.0f)); +TEST_CONSTEXPR(match_m512(_mm512_xor_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -4.0f, +5.0f, -6.0f, +0.0f, +0.0f, -6.0f, +5.0f, -4.0f, -4.0f, +5.0f, -6.0f, +0.0f, +0.0f, -6.0f, +5.0f, -4.0f)); __m512 test_mm512_mask_xor_ps 
-  // CHECK-LABEL: @test_mm512_mask_xor_ps
+  // CHECK-LABEL: test_mm512_mask_xor_ps
  // CHECK: xor <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -294,7 +297,7 @@ __m512 test_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B
}

__m512 test_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_xor_ps
+  // CHECK-LABEL: test_mm512_maskz_xor_ps
  // CHECK: xor <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -302,14 +305,14 @@ __m512 test_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
}

__m512d test_mm512_or_pd (__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_or_pd
+  // CHECK-LABEL: test_mm512_or_pd
  // CHECK: or <8 x i64>
  return (__m512d) _mm512_or_pd(__A, __B);
}
TEST_CONSTEXPR(match_m512d(_mm512_or_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), -4.0, -5.0, -6.0, +7.0, +7.0, -6.0, -5.0, -4.0));

__m512d test_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_or_pd
+  // CHECK-LABEL: test_mm512_mask_or_pd
  // CHECK: or <8 x i64>
  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
  // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -317,7 +320,7 @@ __m512d test_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d _
}

__m512d test_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_or_pd
+  // CHECK-LABEL: test_mm512_maskz_or_pd
  // CHECK: or <8 x i64>
  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
  // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -325,14 +328,14 @@ __m512d test_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
}

__m512 test_mm512_or_ps (__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_or_ps
+  // CHECK-LABEL: test_mm512_or_ps
  // CHECK: or <16 x i32>
  return (__m512) _mm512_or_ps(__A, __B);
}
-TEST_CONSTEXPR(match_m512(_mm512_or_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -4.0f, -5.0f, -6.0f, +7.0f, +7.0f, -6.0f, -5.0f, -4.0f, -4.0f, -5.0f, -6.0f, +7.0f, +7.0f, -6.0f, -5.0f, -4.0f));
+TEST_CONSTEXPR(match_m512(_mm512_or_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -4.0f, -5.0f, -6.0f, +7.0f, +7.0f, -6.0f, -5.0f, -4.0f, -4.0f, -5.0f, -6.0f, +7.0f, +7.0f, -6.0f, -5.0f, -4.0f));

__m512 test_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_or_ps
+  // CHECK-LABEL: test_mm512_mask_or_ps
  // CHECK: or <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -340,7 +343,7 @@ __m512 test_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
}

__m512 test_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_or_ps
+  // CHECK-LABEL: test_mm512_maskz_or_ps
  // CHECK: or <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -348,14 +351,14 @@ __m512 test_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
}

__m512d test_mm512_and_pd (__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_and_pd
+  // CHECK-LABEL: test_mm512_and_pd
  // CHECK: and <8 x i64>
  return (__m512d) _mm512_and_pd(__A, __B);
}
TEST_CONSTEXPR(match_m512d(_mm512_and_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), -0.0, -0.0, +0.0, +7.0, +7.0, +0.0, -0.0, -0.0));

__m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_and_pd
+  // CHECK-LABEL: test_mm512_mask_and_pd
  // CHECK: and <8 x i64>
  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
  // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -363,7 +366,7 @@ __m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d
}

__m512d test_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_and_pd
+  // CHECK-LABEL: test_mm512_maskz_and_pd
  // CHECK: and <8 x i64>
  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
  // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -371,14 +374,14 @@ __m512d test_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
}

__m512 test_mm512_and_ps (__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_and_ps
+  // CHECK-LABEL: test_mm512_and_ps
  // CHECK: and <16 x i32>
  return (__m512) _mm512_and_ps(__A, __B);
}
-TEST_CONSTEXPR(match_m512(_mm512_and_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f, -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f));
+TEST_CONSTEXPR(match_m512(_mm512_and_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f, -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f));

__m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_and_ps
+  // CHECK-LABEL: test_mm512_mask_and_ps
  // CHECK: and <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -386,7 +389,7 @@ __m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B
}

__m512 test_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_and_ps
+  // CHECK-LABEL: test_mm512_maskz_and_ps
  // CHECK: and <16 x i32>
  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -394,7 +397,7 @@ __m512 test_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
}

__m512d test_mm512_andnot_pd (__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_andnot_pd
+  // CHECK-LABEL: test_mm512_andnot_pd
  // CHECK: xor <8 x i64> %{{.*}}, splat (i64 -1)
  // CHECK: and <8 x i64>
  return (__m512d) _mm512_andnot_pd(__A, __B);
@@ -402,7 +405,7 @@ __m512d test_mm512_andnot_pd (__m512d __A, __m512d __B) {
TEST_CONSTEXPR(match_m512d(_mm512_andnot_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), +0.0, +0.0, +0.0, +0.0, +0.0, +0.0, +0.0, +0.0));

__m512d test_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_andnot_pd
+  // CHECK-LABEL: test_mm512_mask_andnot_pd
  // CHECK: xor <8 x i64> %{{.*}}, splat (i64 -1)
  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -410,7 +413,7 @@ __m512d test_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m51
}

__m512d test_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_andnot_pd
+  // CHECK-LABEL: test_mm512_maskz_andnot_pd
  // CHECK: xor <8 x i64> %{{.*}}, splat (i64 -1)
  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
@@ -418,15 +421,15 @@ __m512d test_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
}

__m512 test_mm512_andnot_ps (__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_andnot_ps
+  // CHECK-LABEL: test_mm512_andnot_ps
  // CHECK: xor <16 x i32> %{{.*}}, splat (i32 -1)
  // CHECK: and <16 x i32>
  return (__m512) _mm512_andnot_ps(__A, __B);
}
-TEST_CONSTEXPR(match_m512(_mm512_andnot_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f));
+TEST_CONSTEXPR(match_m512(_mm512_andnot_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f));

__m512 test_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_andnot_ps
+  // CHECK-LABEL: test_mm512_mask_andnot_ps
  // CHECK: xor <16 x i32> %{{.*}}, splat (i32 -1)
  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -434,7 +437,7 @@ __m512 test_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512
}

__m512 test_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_andnot_ps
+  // CHECK-LABEL: test_mm512_maskz_andnot_ps
  // CHECK: xor <16 x i32> %{{.*}}, splat (i32 -1)
  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -442,491 +445,491 @@ __m512 test_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
}

__m512i test_mm512_cvtpd_epi64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvtpd_epi64
+  // CHECK-LABEL: test_mm512_cvtpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_cvtpd_epi64(__A);
}

__m512i test_mm512_mask_cvtpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_epi64
+  // CHECK-LABEL: test_mm512_mask_cvtpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_mask_cvtpd_epi64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvtpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_maskz_cvtpd_epi64(__U, __A);
}

__m512i test_mm512_cvt_roundpd_epi64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_cvt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_cvt_roundpd_epi64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_mask_cvt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.512
  return _mm512_maskz_cvt_roundpd_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvtpd_epu64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvtpd_epu64
+  // CHECK-LABEL: test_mm512_cvtpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_cvtpd_epu64(__A);
}

__m512i test_mm512_mask_cvtpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_epu64
+  // CHECK-LABEL: test_mm512_mask_cvtpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_mask_cvtpd_epu64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvtpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_maskz_cvtpd_epu64(__U, __A);
}

__m512i test_mm512_cvt_roundpd_epu64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_cvt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_cvt_roundpd_epu64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_mask_cvt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.512
  return _mm512_maskz_cvt_roundpd_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvtps_epi64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvtps_epi64
+  // CHECK-LABEL: test_mm512_cvtps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_cvtps_epi64(__A);
}

__m512i test_mm512_mask_cvtps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_epi64
+  // CHECK-LABEL: test_mm512_mask_cvtps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_mask_cvtps_epi64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvtps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_maskz_cvtps_epi64(__U, __A);
}

__m512i test_mm512_cvt_roundps_epi64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundps_epi64
+  // CHECK-LABEL: test_mm512_cvt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_cvt_roundps_epi64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epi64
+  // CHECK-LABEL: test_mm512_mask_cvt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_mask_cvt_roundps_epi64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvtps2qq.512
  return _mm512_maskz_cvt_roundps_epi64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvtps_epu64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvtps_epu64
+  // CHECK-LABEL: test_mm512_cvtps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_cvtps_epu64(__A);
}

__m512i test_mm512_mask_cvtps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_epu64
+  // CHECK-LABEL: test_mm512_mask_cvtps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_mask_cvtps_epu64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvtps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_maskz_cvtps_epu64(__U, __A);
}

__m512i test_mm512_cvt_roundps_epu64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundps_epu64
+  // CHECK-LABEL: test_mm512_cvt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_cvt_roundps_epu64(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epu64
+  // CHECK-LABEL: test_mm512_mask_cvt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_mask_cvt_roundps_epu64(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.512
  return _mm512_maskz_cvt_roundps_epu64(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_cvtepi64_pd(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi64_pd
+  // CHECK-LABEL: test_mm512_cvtepi64_pd
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
  return _mm512_cvtepi64_pd(__A);
}

__m512d test_mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_pd
+  // CHECK-LABEL: test_mm512_mask_cvtepi64_pd
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_cvtepi64_pd(__W, __U, __A);
}

__m512d test_mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_pd
+  // CHECK-LABEL: test_mm512_maskz_cvtepi64_pd
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_cvtepi64_pd(__U, __A);
}

__m512d test_mm512_cvt_roundepi64_pd(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundepi64_pd
+  // CHECK-LABEL: test_mm512_cvt_roundepi64_pd
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f64.v8i64
  return _mm512_cvt_roundepi64_pd(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepi64_pd
+  // CHECK-LABEL: test_mm512_mask_cvt_roundepi64_pd
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f64.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi64_pd
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundepi64_pd
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f64.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_cvt_roundepi64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_cvtepi64_ps(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi64_ps
+  // CHECK-LABEL: test_mm512_cvtepi64_ps
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x float>
  return _mm512_cvtepi64_ps(__A);
}

__m256 test_mm512_mask_cvtepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_ps
+  // CHECK-LABEL: test_mm512_mask_cvtepi64_ps
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x float>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_mask_cvtepi64_ps(__W, __U, __A);
}

__m256 test_mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_ps
+  // CHECK-LABEL: test_mm512_maskz_cvtepi64_ps
  // CHECK: sitofp <8 x i64> %{{.*}} to <8 x float>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_maskz_cvtepi64_ps(__U, __A);
}

__m256 test_mm512_cvt_roundepi64_ps(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundepi64_ps
+  // CHECK-LABEL: test_mm512_cvt_roundepi64_ps
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f32.v8i64
  return _mm512_cvt_roundepi64_ps(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepi64_ps
+  // CHECK-LABEL: test_mm512_mask_cvt_roundepi64_ps
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f32.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi64_ps
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundepi64_ps
  // CHECK: @llvm.x86.avx512.sitofp.round.v8f32.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_maskz_cvt_roundepi64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvttpd_epi64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvttpd_epi64
+  // CHECK-LABEL: test_mm512_cvttpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_cvttpd_epi64(__A);
}

__m512i test_mm512_mask_cvttpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvttpd_epi64
+  // CHECK-LABEL: test_mm512_mask_cvttpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_mask_cvttpd_epi64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvttpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_maskz_cvttpd_epi64(__U, __A);
}

__m512i test_mm512_cvtt_roundpd_epi64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_cvtt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_cvtt_roundpd_epi64(__A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_mask_cvtt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvtt_roundpd_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.512
  return _mm512_maskz_cvtt_roundpd_epi64(__U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvttpd_epu64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvttpd_epu64
+  // CHECK-LABEL: test_mm512_cvttpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_cvttpd_epu64(__A);
}

__m512i test_mm512_mask_cvttpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvttpd_epu64
+  // CHECK-LABEL: test_mm512_mask_cvttpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_mask_cvttpd_epu64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvttpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_maskz_cvttpd_epu64(__U, __A);
}

__m512i test_mm512_cvtt_roundpd_epu64(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_cvtt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_cvtt_roundpd_epu64(__A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvtt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_mask_cvtt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvtt_roundpd_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.512
  return _mm512_maskz_cvtt_roundpd_epu64(__U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvttps_epi64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvttps_epi64
+  // CHECK-LABEL: test_mm512_cvttps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_cvttps_epi64(__A);
}

__m512i test_mm512_mask_cvttps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvttps_epi64
+  // CHECK-LABEL: test_mm512_mask_cvttps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_mask_cvttps_epi64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvttps_epi64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvttps_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvttps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_maskz_cvttps_epi64(__U, __A);
}

__m512i test_mm512_cvtt_roundps_epi64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvtt_roundps_epi64
+  // CHECK-LABEL: test_mm512_cvtt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_cvtt_roundps_epi64(__A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epi64
+  // CHECK-LABEL: test_mm512_mask_cvtt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epi64
+  // CHECK-LABEL: test_mm512_maskz_cvtt_roundps_epi64
  // CHECK: @llvm.x86.avx512.mask.cvttps2qq.512
  return _mm512_maskz_cvtt_roundps_epi64(__U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_cvttps_epu64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvttps_epu64
+  // CHECK-LABEL: test_mm512_cvttps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_cvttps_epu64(__A);
}

__m512i test_mm512_mask_cvttps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvttps_epu64
+  // CHECK-LABEL: test_mm512_mask_cvttps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_mask_cvttps_epu64(__W, __U, __A);
}

__m512i test_mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvttps_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvttps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_maskz_cvttps_epu64(__U, __A);
}

__m512i test_mm512_cvtt_roundps_epu64(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvtt_roundps_epu64
+  // CHECK-LABEL: test_mm512_cvtt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_cvtt_roundps_epu64(__A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epu64
+  // CHECK-LABEL: test_mm512_mask_cvtt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, _MM_FROUND_NO_EXC);
}

__m512i test_mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epu64
+  // CHECK-LABEL: test_mm512_maskz_cvtt_roundps_epu64
  // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.512
  return _mm512_maskz_cvtt_roundps_epu64(__U, __A, _MM_FROUND_NO_EXC);
}

__m512d test_mm512_cvtepu64_pd(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu64_pd
+  // CHECK-LABEL: test_mm512_cvtepu64_pd
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
  return _mm512_cvtepu64_pd(__A);
}

__m512d test_mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu64_pd
+  // CHECK-LABEL: test_mm512_mask_cvtepu64_pd
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_cvtepu64_pd(__W, __U, __A);
}

__m512d test_mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu64_pd
+  // CHECK-LABEL: test_mm512_maskz_cvtepu64_pd
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_cvtepu64_pd(__U, __A);
}

__m512d test_mm512_cvt_roundepu64_pd(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundepu64_pd
+  // CHECK-LABEL: test_mm512_cvt_roundepu64_pd
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f64.v8i64
  return _mm512_cvt_roundepu64_pd(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepu64_pd
+  // CHECK-LABEL: test_mm512_mask_cvt_roundepu64_pd
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f64.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu64_pd
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundepu64_pd
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f64.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_cvt_roundepu64_pd(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_cvtepu64_ps(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu64_ps
+  // CHECK-LABEL: test_mm512_cvtepu64_ps
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x float>
  return _mm512_cvtepu64_ps(__A);
}

__m256 test_mm512_mask_cvtepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu64_ps
+  // CHECK-LABEL: test_mm512_mask_cvtepu64_ps
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x float>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_mask_cvtepu64_ps(__W, __U, __A);
}

__m256 test_mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu64_ps
+  // CHECK-LABEL: test_mm512_maskz_cvtepu64_ps
  // CHECK: uitofp <8 x i64> %{{.*}} to <8 x float>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_maskz_cvtepu64_ps(__U, __A);
}

__m256 test_mm512_cvt_roundepu64_ps(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundepu64_ps
+  // CHECK-LABEL: test_mm512_cvt_roundepu64_ps
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f32.v8i64
  return _mm512_cvt_roundepu64_ps(__A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepu64_ps
+  // CHECK-LABEL: test_mm512_mask_cvt_roundepu64_ps
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f32.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256 test_mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu64_ps
+  // CHECK-LABEL: test_mm512_maskz_cvt_roundepu64_ps
  // CHECK: @llvm.x86.avx512.uitofp.round.v8f32.v8i64
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_maskz_cvt_roundepu64_ps(__U, __A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m512d test_mm512_range_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_range_pd
+  // CHECK-LABEL: test_mm512_range_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_range_pd(__A, __B, 4);
}

__m512d test_mm512_mask_range_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_range_pd
+  // CHECK-LABEL: test_mm512_mask_range_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_mask_range_pd(__W, __U, __A, __B, 4);
}

__m512d test_mm512_maskz_range_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_pd
+  // CHECK-LABEL: test_mm512_maskz_range_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_maskz_range_pd(__U, __A, __B, 4);
}

__m512d test_mm512_range_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_range_round_pd
+  // CHECK-LABEL: test_mm512_range_round_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_range_round_pd(__A, __B, 4, 8);
}

__m512d test_mm512_mask_range_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_range_round_pd
+  // CHECK-LABEL: test_mm512_mask_range_round_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_mask_range_round_pd(__W, __U, __A, __B, 4, 8);
}

__m512d test_mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_round_pd
+  // CHECK-LABEL: test_mm512_maskz_range_round_pd
  // CHECK: @llvm.x86.avx512.mask.range.pd.512
  return _mm512_maskz_range_round_pd(__U, __A, __B, 4, 8);
}

__m128d test_mm512_range_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm512_range_round_sd
+  // CHECK-LABEL: test_mm512_range_round_sd
  // CHECK: @llvm.x86.avx512.mask.range.sd
  return _mm_range_round_sd(__A, __B, 4, 8);
}
@@ -938,31 +941,31 @@ __m128d test_mm512_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, _
}

__m128d test_mm512_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_round_sd
+  // CHECK-LABEL: test_mm512_maskz_range_round_sd
  // CHECK: @llvm.x86.avx512.mask.range.sd
  return _mm_maskz_range_round_sd(__U, __A, __B, 4, 8);
}

__m128 test_mm512_range_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_range_round_ss
+  // CHECK-LABEL: test_mm512_range_round_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_range_round_ss(__A, __B, 4, 8);
}

__m128 test_mm512_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_mask_range_round_ss
+  // CHECK-LABEL: test_mm512_mask_range_round_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_mask_range_round_ss(__W, __U, __A, __B, 4, 8);
}

__m128 test_mm512_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_round_ss
+  // CHECK-LABEL: test_mm512_maskz_range_round_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_maskz_range_round_ss(__U, __A, __B, 4, 8);
}

__m128d test_mm_range_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_range_sd
+  // CHECK-LABEL: test_mm_range_sd
  // CHECK: @llvm.x86.avx512.mask.range.sd
  return _mm_range_sd(__A, __B, 4);
}
@@ -974,558 +977,558 @@ __m128d test_mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __
}

__m128d test_mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_range_sd
+  // CHECK-LABEL: test_mm_maskz_range_sd
  // CHECK: @llvm.x86.avx512.mask.range.sd
  return _mm_maskz_range_sd(__U, __A, __B, 4);
}

__m128 test_mm_range_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_range_ss
+  // CHECK-LABEL: test_mm_range_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_range_ss(__A, __B, 4);
}

__m128 test_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_range_ss
+  // CHECK-LABEL: test_mm_mask_range_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_mask_range_ss(__W, __U, __A, __B, 4);
}

__m128 test_mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_range_ss
+  // CHECK-LABEL: test_mm_maskz_range_ss
  // CHECK: @llvm.x86.avx512.mask.range.ss
  return _mm_maskz_range_ss(__U, __A, __B, 4);
}

__m512 test_mm512_range_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_range_ps
+  // CHECK-LABEL: test_mm512_range_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_range_ps(__A, __B, 4);
}

__m512 test_mm512_mask_range_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_range_ps
+  // CHECK-LABEL: test_mm512_mask_range_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_mask_range_ps(__W, __U, __A, __B, 4);
}

__m512 test_mm512_maskz_range_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_ps
+  // CHECK-LABEL: test_mm512_maskz_range_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_maskz_range_ps(__U, __A, __B, 4);
}

__m512 test_mm512_range_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_range_round_ps
+  // CHECK-LABEL: test_mm512_range_round_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_range_round_ps(__A, __B, 4, 8);
}

__m512 test_mm512_mask_range_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_range_round_ps
+  // CHECK-LABEL: test_mm512_mask_range_round_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_mask_range_round_ps(__W, __U, __A, __B, 4, 8);
}

__m512 test_mm512_maskz_range_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_range_round_ps
+  // CHECK-LABEL: test_mm512_maskz_range_round_ps
  // CHECK: @llvm.x86.avx512.mask.range.ps.512
  return _mm512_maskz_range_round_ps(__U, __A, __B, 4, 8);
}

__m512d test_mm512_reduce_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_reduce_pd
+  // CHECK-LABEL: test_mm512_reduce_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_reduce_pd(__A, 4);
}

__m512d test_mm512_mask_reduce_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_pd
+  // CHECK-LABEL: test_mm512_mask_reduce_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_mask_reduce_pd(__W, __U, __A, 4);
}

__m512d test_mm512_maskz_reduce_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_pd
+  // CHECK-LABEL: test_mm512_maskz_reduce_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_maskz_reduce_pd(__U, __A, 4);
}

__m512 test_mm512_reduce_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_reduce_ps
+  // CHECK-LABEL: test_mm512_reduce_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_reduce_ps(__A, 4);
}

__m512 test_mm512_mask_reduce_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_ps
+  // CHECK-LABEL: test_mm512_mask_reduce_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_mask_reduce_ps(__W, __U, __A, 4);
}

__m512 test_mm512_maskz_reduce_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_ps
+  // CHECK-LABEL: test_mm512_maskz_reduce_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_maskz_reduce_ps(__U, __A, 4);
}

__m512d test_mm512_reduce_round_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_reduce_round_pd
+  // CHECK-LABEL: test_mm512_reduce_round_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_reduce_round_pd(__A, 4, 8);
}

__m512d test_mm512_mask_reduce_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_round_pd
+  // CHECK-LABEL: test_mm512_mask_reduce_round_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_mask_reduce_round_pd(__W, __U, __A, 4, 8);
}

__m512d test_mm512_maskz_reduce_round_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_round_pd
+  // CHECK-LABEL: test_mm512_maskz_reduce_round_pd
  // CHECK: @llvm.x86.avx512.mask.reduce.pd.512
  return _mm512_maskz_reduce_round_pd(__U, __A, 4, 8);
}

__m512 test_mm512_reduce_round_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_reduce_round_ps
+  // CHECK-LABEL: test_mm512_reduce_round_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_reduce_round_ps(__A, 4, 8);
}

__m512 test_mm512_mask_reduce_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_round_ps
+  // CHECK-LABEL: test_mm512_mask_reduce_round_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_mask_reduce_round_ps(__W, __U, __A, 4, 8);
}

__m512 test_mm512_maskz_reduce_round_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_round_ps
+  // CHECK-LABEL: test_mm512_maskz_reduce_round_ps
  // CHECK: @llvm.x86.avx512.mask.reduce.ps.512
  return _mm512_maskz_reduce_round_ps(__U, __A, 4, 8);
}

__m128 test_mm_reduce_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_reduce_ss
+  // CHECK-LABEL: test_mm_reduce_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_reduce_ss(__A, __B, 4);
}

__m128 test_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_ss
+  // CHECK-LABEL: test_mm_mask_reduce_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_mask_reduce_ss(__W, __U, __A, __B, 4);
}

__m128 test_mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_ss
+  // CHECK-LABEL: test_mm_maskz_reduce_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_maskz_reduce_ss(__U, __A, __B, 4);
}

__m128 test_mm_reduce_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_reduce_round_ss
+  // CHECK-LABEL: test_mm_reduce_round_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_reduce_round_ss(__A, __B, 4, 8);
}

__m128 test_mm_mask_reduce_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_round_ss
+  // CHECK-LABEL: test_mm_mask_reduce_round_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_mask_reduce_round_ss(__W, __U, __A, __B, 4, 8);
}

__m128 test_mm_maskz_reduce_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_round_ss
+  // CHECK-LABEL: test_mm_maskz_reduce_round_ss
  // CHECK: @llvm.x86.avx512.mask.reduce.ss
  return _mm_maskz_reduce_round_ss(__U, __A, __B, 4, 8);
}

__m128d test_mm_reduce_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_reduce_sd
+  // CHECK-LABEL: test_mm_reduce_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_reduce_sd(__A, __B, 4);
}

__m128d test_mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_sd
+  // CHECK-LABEL: test_mm_mask_reduce_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_mask_reduce_sd(__W, __U, __A, __B, 4);
}

__m128d test_mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_sd
+  // CHECK-LABEL: test_mm_maskz_reduce_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_maskz_reduce_sd(__U, __A, __B, 4);
}

__m128d test_mm_reduce_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_reduce_round_sd
+  // CHECK-LABEL: test_mm_reduce_round_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_reduce_round_sd(__A, __B, 4, 8);
}

__m128d test_mm_mask_reduce_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_round_sd
+  // CHECK-LABEL: test_mm_mask_reduce_round_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_mask_reduce_round_sd(__W, __U, __A, __B, 4, 8);
}

__m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_round_sd
+  // CHECK-LABEL: test_mm_maskz_reduce_round_sd
  // CHECK: @llvm.x86.avx512.mask.reduce.sd
  return _mm_maskz_reduce_round_sd(__U, __A, __B, 4, 8);
}

__mmask16 test_mm512_movepi32_mask(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_movepi32_mask
+  // CHECK-LABEL: test_mm512_movepi32_mask
  // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> %{{.*}}, zeroinitializer
  return _mm512_movepi32_mask(__A);
}

__m512i test_mm512_movm_epi32(__mmask16 __A) {
-  // CHECK-LABEL: @test_mm512_movm_epi32
+  // CHECK-LABEL: test_mm512_movm_epi32
  // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
  // CHECK: %vpmovm2.i = sext <16 x i1> %{{.*}} to <16 x i32>
  return _mm512_movm_epi32(__A);
}

__m512i test_mm512_movm_epi64(__mmask8 __A) {
-  // CHECK-LABEL: @test_mm512_movm_epi64
+  // CHECK-LABEL: test_mm512_movm_epi64
  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
  // CHECK: %vpmovm2.i = sext <8 x i1> %{{.*}} to <8 x i64>
  return _mm512_movm_epi64(__A);
}

__mmask8 test_mm512_movepi64_mask(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_movepi64_mask
+  // CHECK-LABEL: test_mm512_movepi64_mask
  // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
  return _mm512_movepi64_mask(__A);
}

__m512 test_mm512_broadcast_f32x2(__m128 __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_f32x2
+  // CHECK-LABEL: test_mm512_broadcast_f32x2
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  return _mm512_broadcast_f32x2(__A);
}

__m512 test_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x2
+  // CHECK-LABEL: test_mm512_mask_broadcast_f32x2
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_mask_broadcast_f32x2(__O, __M, __A);
}

__m512 test_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x2
+  // CHECK-LABEL: test_mm512_maskz_broadcast_f32x2
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_maskz_broadcast_f32x2(__M, __A);
}

__m512 test_mm512_broadcast_f32x8(float const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_f32x8
+  // CHECK-LABEL: test_mm512_broadcast_f32x8
  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm512_broadcast_f32x8(_mm256_loadu_ps(__A));
}

__m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x8
+  // CHECK-LABEL: test_mm512_mask_broadcast_f32x8
  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_mask_broadcast_f32x8(__O, __M, _mm256_loadu_ps(__A));
}

__m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x8
+  // CHECK-LABEL: test_mm512_maskz_broadcast_f32x8
  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_maskz_broadcast_f32x8(__M, _mm256_loadu_ps(__A));
}

__m512d test_mm512_broadcast_f64x2(double const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_f64x2
+  // CHECK-LABEL: test_mm512_broadcast_f64x2
  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  return _mm512_broadcast_f64x2(_mm_loadu_pd(__A));
}

__m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x2
+  // CHECK-LABEL: test_mm512_mask_broadcast_f64x2
  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A));
}

__m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x2
+  // CHECK-LABEL: test_mm512_maskz_broadcast_f64x2
  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A));
}

__m512i test_mm512_broadcast_i32x2(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_i32x2
+  // CHECK-LABEL: test_mm512_broadcast_i32x2
  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  return _mm512_broadcast_i32x2(__A);
}

__m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x2
+  // CHECK-LABEL: test_mm512_mask_broadcast_i32x2
  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
  return _mm512_mask_broadcast_i32x2(__O, __M, __A);
}

__m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x2
+  // CHECK-LABEL: test_mm512_maskz_broadcast_i32x2
  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
  return _mm512_maskz_broadcast_i32x2(__M, __A);
}

__m512i test_mm512_broadcast_i32x8(__m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_i32x8
+  // CHECK-LABEL: test_mm512_broadcast_i32x8
  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  return _mm512_broadcast_i32x8(_mm256_loadu_si256(__A));
}

__m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x8
+  // CHECK-LABEL: test_mm512_mask_broadcast_i32x8
  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
  return _mm512_mask_broadcast_i32x8(__O, __M, _mm256_loadu_si256(__A));
}

__m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x8
+  // CHECK-LABEL: test_mm512_maskz_broadcast_i32x8
  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
  return _mm512_maskz_broadcast_i32x8(__M, _mm256_loadu_si256(__A));
}

__m512i test_mm512_broadcast_i64x2(__m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_i64x2
+  // CHECK-LABEL: test_mm512_broadcast_i64x2
  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  return _mm512_broadcast_i64x2(_mm_loadu_si128(__A));
}

__m512i test_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x2
+  // CHECK-LABEL: test_mm512_mask_broadcast_i64x2
  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
  return _mm512_mask_broadcast_i64x2(__O, __M, _mm_loadu_si128(__A));
}

__m512i test_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x2
+  // CHECK-LABEL: test_mm512_maskz_broadcast_i64x2
  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
  return _mm512_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A));
}

__m256 test_mm512_extractf32x8_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_extractf32x8_ps
+  // CHECK-LABEL: test_mm512_extractf32x8_ps
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  return _mm512_extractf32x8_ps(__A, 1);
}

__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_extractf32x8_ps
+  // CHECK-LABEL: test_mm512_mask_extractf32x8_ps
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1);
}

__m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps
+  // CHECK-LABEL: test_mm512_maskz_extractf32x8_ps
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
  return _mm512_maskz_extractf32x8_ps(__U, __A, 1);
}

__m128d test_mm512_extractf64x2_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_extractf64x2_pd
+  // CHECK-LABEL: test_mm512_extractf64x2_pd
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
  return _mm512_extractf64x2_pd(__A, 3);
}

__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_extractf64x2_pd
+  // CHECK-LABEL: test_mm512_mask_extractf64x2_pd
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3);
}

__m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd
+  // CHECK-LABEL: test_mm512_maskz_extractf64x2_pd
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
  return _mm512_maskz_extractf64x2_pd(__U, __A, 3);
}

__m256i test_mm512_extracti32x8_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_extracti32x8_epi32
+  // CHECK-LABEL: test_mm512_extracti32x8_epi32
  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  return _mm512_extracti32x8_epi32(__A, 1);
}

__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32
+  // CHECK-LABEL: test_mm512_mask_extracti32x8_epi32
  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1);
}

__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32
+  // CHECK-LABEL: test_mm512_maskz_extracti32x8_epi32
  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
  return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}

__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_extracti64x2_epi64
+  // CHECK-LABEL: test_mm512_extracti64x2_epi64
  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
  return _mm512_extracti64x2_epi64(__A, 3);
}

__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64
+  // CHECK-LABEL: test_mm512_mask_extracti64x2_epi64
  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3);
}

__m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64
+  // CHECK-LABEL: test_mm512_maskz_extracti64x2_epi64
  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
  return _mm512_maskz_extracti64x2_epi64(__U, __A, 3);
}

__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm512_insertf32x8
+  // CHECK-LABEL: test_mm512_insertf32x8
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  return _mm512_insertf32x8(__A, __B, 1);
}

__m512 test_mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm512_mask_insertf32x8
+  // CHECK-LABEL: test_mm512_mask_insertf32x8
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_mask_insertf32x8(__W, __U, __A, __B, 1);
}

__m512 test_mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_insertf32x8
+  // CHECK-LABEL: test_mm512_maskz_insertf32x8
  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
  return _mm512_maskz_insertf32x8(__U, __A, __B, 1);
}

__m512d test_mm512_insertf64x2(__m512d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm512_insertf64x2
+  // CHECK-LABEL: test_mm512_insertf64x2
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  return _mm512_insertf64x2(__A, __B, 3);
}

__m512d test_mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm512_mask_insertf64x2
+  // CHECK-LABEL: test_mm512_mask_insertf64x2
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_mask_insertf64x2(__W, __U, __A, __B, 3);
}

__m512d test_mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_insertf64x2
+  // CHECK-LABEL: test_mm512_maskz_insertf64x2
  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
  return _mm512_maskz_insertf64x2(__U, __A, __B, 3);
}

__m512i test_mm512_inserti32x8(__m512i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_inserti32x8
+  // CHECK-LABEL: test_mm512_inserti32x8
  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  return _mm512_inserti32x8(__A, __B, 1);
}

__m512i test_mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_mask_inserti32x8
+  // CHECK-LABEL: test_mm512_mask_inserti32x8
  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_inserti32x8(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B) { - // CHECK-LABEL: @test_mm512_maskz_inserti32x8 + // CHECK-LABEL: test_mm512_maskz_inserti32x8 // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_inserti32x8(__U, __A, __B, 1); } __m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) { - // CHECK-LABEL: @test_mm512_inserti64x2 + // CHECK-LABEL: test_mm512_inserti64x2 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> return _mm512_inserti64x2(__A, __B, 1); } __m512i test_mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - // CHECK-LABEL: @test_mm512_mask_inserti64x2 + // CHECK-LABEL: test_mm512_mask_inserti64x2 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_inserti64x2(__W, __U, __A, __B, 1); } __m512i test_mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B) { - // CHECK-LABEL: @test_mm512_maskz_inserti64x2 + // CHECK-LABEL: test_mm512_maskz_inserti64x2 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_inserti64x2(__U, __A, __B, 1); } __mmask8 test_mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A) { - // CHECK-LABEL: @test_mm512_mask_fpclass_pd_mask + // CHECK-LABEL: test_mm512_mask_fpclass_pd_mask // CHECK: @llvm.x86.avx512.fpclass.pd.512 return _mm512_mask_fpclass_pd_mask(__U, __A, 4); } __mmask8 test_mm512_fpclass_pd_mask(__m512d __A) { - // CHECK-LABEL: @test_mm512_fpclass_pd_mask + // CHECK-LABEL: test_mm512_fpclass_pd_mask // CHECK: @llvm.x86.avx512.fpclass.pd.512 return _mm512_fpclass_pd_mask(__A, 4); } __mmask16 test_mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_fpclass_ps_mask + // CHECK-LABEL: test_mm512_mask_fpclass_ps_mask // CHECK: @llvm.x86.avx512.fpclass.ps.512 return _mm512_mask_fpclass_ps_mask(__U, __A, 4); } __mmask16 test_mm512_fpclass_ps_mask(__m512 __A) { - // CHECK-LABEL: @test_mm512_fpclass_ps_mask + // CHECK-LABEL: test_mm512_fpclass_ps_mask // CHECK: @llvm.x86.avx512.fpclass.ps.512 return _mm512_fpclass_ps_mask(__A, 4); } __mmask8 test_mm_fpclass_sd_mask(__m128d __A) { - // CHECK-LABEL: @test_mm_fpclass_sd_mask + // CHECK-LABEL: test_mm_fpclass_sd_mask // CHECK: @llvm.x86.avx512.mask.fpclass.sd return _mm_fpclass_sd_mask (__A, 2); } __mmask8 test_mm_mask_fpclass_sd_mask(__mmask8 __U, __m128d __A) { - // CHECK-LABEL: @test_mm_mask_fpclass_sd_mask + // CHECK-LABEL: test_mm_mask_fpclass_sd_mask // CHECK: @llvm.x86.avx512.mask.fpclass.sd return _mm_mask_fpclass_sd_mask (__U, __A, 2); } __mmask8 test_mm_fpclass_ss_mask(__m128 __A) { - // CHECK-LABEL: @test_mm_fpclass_ss_mask + // CHECK-LABEL: test_mm_fpclass_ss_mask // CHECK: @llvm.x86.avx512.mask.fpclass.ss return _mm_fpclass_ss_mask ( __A, 2); } 
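// These label checks drop the leading "@" from "CHECK-LABEL: @test_...".
// Elsewhere in this patch the same rewrite accompanies new "-x c++" RUN lines
// (see the avx512vpopcntdq tests below): when the file is also compiled as
// C++, the function symbol is mangled, so an exact IR reference such as
// "@test_mm512_insertf32x8" matches only the C output. Without the "@", the
// label is a plain substring match that covers both. A sketch of the two
// define lines involved (simplified; the mangled spelling is an assumption
// based on Itanium mangling of this signature):
//
//   define <16 x float> @test_mm512_insertf32x8(...)                ; -x c
//   define <16 x float> @_Z22test_mm512_insertf32x8Dv16_fDv8_f(...) ; -x c++
//
// "CHECK-LABEL: test_mm512_insertf32x8" matches either line.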
__mmask8 test_mm_mask_fpclass_ss_mask(__mmask8 __U, __m128 __A) { - // CHECK-LABEL: @test_mm_mask_fpclass_ss_mask + // CHECK-LABEL: test_mm_mask_fpclass_ss_mask // CHECK: @llvm.x86.avx512.mask.fpclass.ss return _mm_mask_fpclass_ss_mask (__U, __A, 2); } diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 5447035..8c14c57 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -154,6 +154,7 @@ __m512 test_mm512_add_ps(__m512 a, __m512 b) // CHECK: fadd <16 x float> return _mm512_add_ps(a, b); } +TEST_CONSTEXPR(match_m512(_mm512_add_ps((__m512){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}, (__m512){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}), -2.0f, -4.0f, -6.0f, -8.0f, -10.0f, -12.0f, -14.0f, -16.0f, +2.0f, +4.0f, +6.0f, +8.0f, +10.0f, +12.0f, +14.0f, +16.0f)); __m512d test_mm512_add_pd(__m512d a, __m512d b) { @@ -161,6 +162,7 @@ __m512d test_mm512_add_pd(__m512d a, __m512d b) // CHECK: fadd <8 x double> return _mm512_add_pd(a, b); } +TEST_CONSTEXPR(match_m512d(_mm512_add_pd((__m512d){-1.0, -2.0, -3.0, -4.0, +1.0, +2.0, +3.0, +4.0}, (__m512d){-1.0, -2.0, -3.0, -4.0, +1.0, +2.0, +3.0, +4.0}), -2.0, -4.0, -6.0, -8.0, +2.0, +4.0, +6.0, +8.0)); __m512 test_mm512_mul_ps(__m512 a, __m512 b) { @@ -168,6 +170,7 @@ __m512 test_mm512_mul_ps(__m512 a, __m512 b) // CHECK: fmul <16 x float> return _mm512_mul_ps(a, b); } +TEST_CONSTEXPR(match_m512(_mm512_mul_ps((__m512){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}, (__m512){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f, +1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}), +1.0f, +4.0f, +9.0f, +16.0f, +25.0f, +36.0f, +49.0f, +64.0f, +1.0f, +4.0f, +9.0f, +16.0f, +25.0f, +36.0f, +49.0f, +64.0f)); __m512d test_mm512_mul_pd(__m512d a, __m512d b) { @@ -175,6 +178,7 @@ __m512d test_mm512_mul_pd(__m512d a, __m512d b) // CHECK: fmul <8 x double> return _mm512_mul_pd(a, b); } +TEST_CONSTEXPR(match_m512d(_mm512_mul_pd((__m512d){-1.0, -2.0, -3.0, -4.0, +1.0, +2.0, +3.0, +4.0}, (__m512d){-1.0, -2.0, -3.0, -4.0, +1.0, +2.0, +3.0, +4.0}), +1.0, +4.0, +9.0, +16.0, +1.0, +4.0, +9.0, +16.0)); void test_mm512_storeu_si512 (void *__P, __m512i __A) { @@ -1261,6 +1265,7 @@ __m512d test_mm512_unpackhi_pd(__m512d a, __m512d b) // CHECK: shufflevector <8 x double> {{.*}} <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> return _mm512_unpackhi_pd(a, b); } +TEST_CONSTEXPR(match_m512d(_mm512_unpackhi_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), +2.0, +10.0, +4.0, +12.0, +6.0, +14.0, +8.0, +16.0)); __m512d test_mm512_unpacklo_pd(__m512d a, __m512d b) { @@ -1268,6 +1273,7 @@ __m512d test_mm512_unpacklo_pd(__m512d a, __m512d b) // CHECK: shufflevector <8 x double> {{.*}} <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> return _mm512_unpacklo_pd(a, b); } +TEST_CONSTEXPR(match_m512d(_mm512_unpacklo_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), +1.0, +9.0, +3.0, +11.0, +5.0, +13.0, +7.0, +15.0)); __m512 test_mm512_unpackhi_ps(__m512 a, __m512 b) { @@ -1275,6 +1281,7 @@ __m512 test_mm512_unpackhi_ps(__m512 a, __m512 b) // CHECK: shufflevector <16 x float> {{.*}} <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, 
i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> return _mm512_unpackhi_ps(a, b); } +TEST_CONSTEXPR(match_m512(_mm512_unpackhi_ps((__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}, (__m512){16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f}), +2.0f, +18.0f, +3.0f, +19.0f, +6.0f, +22.0f, +7.0f, +23.0f, +10.0f, +26.0f, +11.0f, +27.0f, +14.0f, +30.0f, +15.0f, +31.0f)); __m512 test_mm512_unpacklo_ps(__m512 a, __m512 b) { @@ -1282,6 +1289,7 @@ __m512 test_mm512_unpacklo_ps(__m512 a, __m512 b) // CHECK: shufflevector <16 x float> {{.*}} <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> return _mm512_unpacklo_ps(a, b); } +TEST_CONSTEXPR(match_m512(_mm512_unpacklo_ps((__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}, (__m512){16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f}), +0.0f, +16.0f, +1.0f, +17.0f, +4.0f, +20.0f, +5.0f, +21.0f, +8.0f, +24.0f, +9.0f, +25.0f, +12.0f, +28.0f, +13.0f, +29.0f)); __mmask16 test_mm512_cmp_round_ps_mask(__m512 a, __m512 b) { // CHECK-LABEL: test_mm512_cmp_round_ps_mask @@ -3551,6 +3559,7 @@ __m512d test_mm512_div_pd(__m512d __a, __m512d __b) { // CHECK: fdiv <8 x double> return _mm512_div_pd(__a,__b); } +TEST_CONSTEXPR(match_m512d(_mm512_div_pd((__m512d){+8.0, +6.0, +4.0, +2.0, -8.0, -6.0, -4.0, -2.0}, (__m512d){+2.0, +2.0, +2.0, +2.0, -2.0, -2.0, -2.0, -2.0}), +4.0, +3.0, +2.0, +1.0, +4.0, +3.0, +2.0, +1.0)); __m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) { // CHECK-LABEL: test_mm512_mask_div_pd // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}} @@ -3585,6 +3594,7 @@ __m512 test_mm512_div_ps(__m512 __A, __m512 __B) { // CHECK: fdiv <16 x float> return _mm512_div_ps(__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_div_ps((__m512){+16.0f, +14.0f, +12.0f, +10.0f, +8.0f, +6.0f, +4.0f, +2.0f, -16.0f, -14.0f, -12.0f, -10.0f, -8.0f, -6.0f, -4.0f, -2.0f}, (__m512){+2.0f, +2.0f, +2.0f, +2.0f, +2.0f, +2.0f, +2.0f, +2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f}), +8.0f, +7.0f, +6.0f, +5.0f, +4.0f, +3.0f, +2.0f, +1.0f, +8.0f, +7.0f, +6.0f, +5.0f, +4.0f, +3.0f, +2.0f, +1.0f)); __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_mask_div_ps // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}} diff --git a/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c b/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c index ca8f5e4..8927ae2 100644 --- a/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c @@ -1,45 +1,48 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vpopcntdq -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -emit-llvm -o - -Wall -Werror | FileCheck 
%s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vpopcntdq -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> #include "builtin_test_helpers.h" __m512i test_mm512_popcnt_epi64(__m512i __A) { - // CHECK-LABEL: @test_mm512_popcnt_epi64 + // CHECK-LABEL: test_mm512_popcnt_epi64 // CHECK: @llvm.ctpop.v8i64 return _mm512_popcnt_epi64(__A); } -TEST_CONSTEXPR(match_v8di(_mm512_popcnt_epi64((__m512i)(__v8di){+5, -3, -10, +8, 0, -256, +256, -128}), 2, 31, 30, 1, 0, 24, 1, 25)); +TEST_CONSTEXPR(match_v8di(_mm512_popcnt_epi64((__m512i)(__v8di){+5, -3, -10, +8, 0, -256, +256, -128}), 2, 63, 62, 1, 0, 56, 1, 57)); __m512i test_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - // CHECK-LABEL: @test_mm512_mask_popcnt_epi64 + // CHECK-LABEL: test_mm512_mask_popcnt_epi64 // CHECK: @llvm.ctpop.v8i64 // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_popcnt_epi64(__W, __U, __A); } __m512i test_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { - // CHECK-LABEL: @test_mm512_maskz_popcnt_epi64 + // CHECK-LABEL: test_mm512_maskz_popcnt_epi64 // CHECK: @llvm.ctpop.v8i64 // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_popcnt_epi64(__U, __A); } __m512i test_mm512_popcnt_epi32(__m512i __A) { - // CHECK-LABEL: @test_mm512_popcnt_epi32 + // CHECK-LABEL: test_mm512_popcnt_epi32 // CHECK: @llvm.ctpop.v16i32 return _mm512_popcnt_epi32(__A); } TEST_CONSTEXPR(match_v16si(_mm512_popcnt_epi32((__m512i)(__v16si){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), 2, 31, 30, 1, 0, 24, 1, 25, 2, 2, 4, 2, 6, 2, 9, 2)); __m512i test_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { - // CHECK-LABEL: @test_mm512_mask_popcnt_epi32 + // CHECK-LABEL: test_mm512_mask_popcnt_epi32 // CHECK: @llvm.ctpop.v16i32 // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_popcnt_epi32(__W, __U, __A); } __m512i test_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { - // CHECK-LABEL: @test_mm512_maskz_popcnt_epi32 + // CHECK-LABEL: test_mm512_maskz_popcnt_epi32 // CHECK: @llvm.ctpop.v16i32 // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_popcnt_epi32(__U, __A); diff --git a/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c b/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c index 5d18b68..d9fbd76 100644 --- a/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c @@ -1,87 +1,90 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vpopcntdq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vpopcntdq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none 
-ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vpopcntdq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> #include "builtin_test_helpers.h" __m128i test_mm_popcnt_epi64(__m128i __A) { - // CHECK-LABEL: @test_mm_popcnt_epi64 + // CHECK-LABEL: test_mm_popcnt_epi64 // CHECK: @llvm.ctpop.v2i64 return _mm_popcnt_epi64(__A); } TEST_CONSTEXPR(match_v2di(_mm_popcnt_epi64((__m128i)(__v2di){+5, -3}), 2, 63)); __m128i test_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_popcnt_epi64 + // CHECK-LABEL: test_mm_mask_popcnt_epi64 // CHECK: @llvm.ctpop.v2i64 // CHECK: select <2 x i1> %{{.+}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_popcnt_epi64(__W, __U, __A); } __m128i test_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_popcnt_epi64 + // CHECK-LABEL: test_mm_maskz_popcnt_epi64 // CHECK: @llvm.ctpop.v2i64 // CHECK: select <2 x i1> %{{.+}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_popcnt_epi64(__U, __A); } __m128i test_mm_popcnt_epi32(__m128i __A) { - // CHECK-LABEL: @test_mm_popcnt_epi32 + // CHECK-LABEL: test_mm_popcnt_epi32 // CHECK: @llvm.ctpop.v4i32 return _mm_popcnt_epi32(__A); } TEST_CONSTEXPR(match_v4si(_mm_popcnt_epi32((__m128i)(__v4si){+5, -3, -10, +8}), 2, 31, 30, 1)); __m128i test_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_popcnt_epi32 + // CHECK-LABEL: test_mm_mask_popcnt_epi32 // CHECK: @llvm.ctpop.v4i32 // CHECK: select <4 x i1> %{{.+}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_popcnt_epi32(__W, __U, __A); } __m128i test_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_popcnt_epi32 + // CHECK-LABEL: test_mm_maskz_popcnt_epi32 // CHECK: @llvm.ctpop.v4i32 // CHECK: select <4 x i1> %{{.+}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_popcnt_epi32(__U, __A); } __m256i test_mm256_popcnt_epi64(__m256i __A) { - // CHECK-LABEL: @test_mm256_popcnt_epi64 + // CHECK-LABEL: test_mm256_popcnt_epi64 // CHECK: @llvm.ctpop.v4i64 return _mm256_popcnt_epi64(__A); } TEST_CONSTEXPR(match_v4di(_mm256_popcnt_epi64((__m256i)(__v4di){+5, -3, -10, +8}), 2, 63, 62, 1)); __m256i test_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_popcnt_epi64 + // CHECK-LABEL: test_mm256_mask_popcnt_epi64 // CHECK: @llvm.ctpop.v4i64 // CHECK: select <4 x i1> %{{.+}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_popcnt_epi64(__W, __U, __A); } __m256i test_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_popcnt_epi64 + // CHECK-LABEL: test_mm256_maskz_popcnt_epi64 // CHECK: @llvm.ctpop.v4i64 // CHECK: select <4 x i1> %{{.+}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_popcnt_epi64(__U, __A); } __m256i test_mm256_popcnt_epi32(__m256i __A) { - // CHECK-LABEL: @test_mm256_popcnt_epi32 + // CHECK-LABEL: test_mm256_popcnt_epi32 // CHECK: @llvm.ctpop.v8i32 return _mm256_popcnt_epi32(__A); } TEST_CONSTEXPR(match_v8si(_mm256_popcnt_epi32((__m256i)(__v8si){+5, -3, -10, +8, 0, -256, +256, -128}), 2, 31, 30, 1, 0, 24, 1, 25)); __m256i test_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_popcnt_epi32 + // CHECK-LABEL: test_mm256_mask_popcnt_epi32 // CHECK: @llvm.ctpop.v8i32 // CHECK: select <8 x i1> %{{.+}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_popcnt_epi32(__W, __U, __A); } 
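// The TEST_CONSTEXPR expectations in these popcount tests are two's-complement
// bit counts at the lane width, which is why the same inputs produce different
// answers for epi32 and epi64 lanes: -3 in a 64-bit lane is 0xFFFFFFFFFFFFFFFD
// (63 set bits), while -3 in a 32-bit lane is 0xFFFFFFFD (31 set bits). A
// minimal standalone sketch of that arithmetic, assuming nothing beyond the C
// standard library (popcount_bits is a hypothetical helper, not part of this
// test suite):

#include <stdio.h>

// Count set bits by clearing the lowest set bit until zero (Kernighan's method).
static int popcount_bits(unsigned long long v) {
  int n = 0;
  for (; v; v &= v - 1)
    ++n;
  return n;
}

int main(void) {
  // 64-bit lane: -3 sign-extends to 0xFFFFFFFFFFFFFFFD -> 63 set bits.
  printf("%d\n", popcount_bits((unsigned long long)(long long)-3)); // prints 63
  // 32-bit lane: -3 truncates to 0xFFFFFFFD -> 31 set bits.
  printf("%d\n", popcount_bits((unsigned int)-3)); // prints 31
  return 0;
}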
__m256i test_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_popcnt_epi32 + // CHECK-LABEL: test_mm256_maskz_popcnt_epi32 // CHECK: @llvm.ctpop.v8i32 // CHECK: select <8 x i1> %{{.+}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_popcnt_epi32(__U, __A); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index f201dfe..375664b 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -751,24 +751,3 @@ void *tp (void) { return __builtin_thread_pointer (); // WEBASSEMBLY: call {{.*}} @llvm.thread.pointer.p0() } - -typedef void (*Fvoid)(void); -typedef float (*Ffloats)(float, double, int); -typedef void (*Fpointers)(Fvoid, Ffloats, void*, int*, int***, char[5]); - -void use(int); - -void test_function_pointer_signature_void(Fvoid func) { - // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison) - use(__builtin_wasm_test_function_pointer_signature(func)); -} - -void test_function_pointer_signature_floats(Ffloats func) { - // WEBASSEMBLY: tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, float 0.000000e+00, token poison, float 0.000000e+00, double 0.000000e+00, i32 0) - use(__builtin_wasm_test_function_pointer_signature(func)); -} - -void test_function_pointer_signature_pointers(Fpointers func) { - // WEBASSEMBLY: %0 = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null) - use(__builtin_wasm_test_function_pointer_signature(func)); -} diff --git a/clang/test/Headers/__cpuidex_conflict.c b/clang/test/Headers/__cpuidex_conflict.c index 74f4532..d14ef29 100644 --- a/clang/test/Headers/__cpuidex_conflict.c +++ b/clang/test/Headers/__cpuidex_conflict.c @@ -5,6 +5,7 @@ // Ensure that we do not run into conflicts when offloading. 
// RUN: %clang_cc1 %s -DIS_STATIC=static -ffreestanding -fopenmp -fopenmp-is-target-device -aux-triple x86_64-unknown-linux-gnu +// RUN: %clang_cc1 -DIS_STATIC="" -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-linux-gnu -aux-target-cpu x86-64 -fcuda-is-device -internal-isystem /home/gha/llvm-project/build/lib/clang/22/include -x cuda %s -o - typedef __SIZE_TYPE__ size_t; diff --git a/clang/test/Sema/builtins-wasm.c b/clang/test/Sema/builtins-wasm.c index a3486b1..9075e9e 100644 --- a/clang/test/Sema/builtins-wasm.c +++ b/clang/test/Sema/builtins-wasm.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify -triple wasm32 -target-feature +reference-types %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple wasm32 -target-abi experimental-mv -DMULTIVALUE -target-feature +reference-types %s #define EXPR_HAS_TYPE(expr, type) _Generic((expr), type : 1, default : 0) @@ -57,8 +58,8 @@ void test_table_copy(int dst_idx, int src_idx, int nelem) { typedef void (*F1)(void); typedef int (*F2)(int); -typedef int (*F3)(__externref_t); -typedef __externref_t (*F4)(int); +typedef void (*F3)(struct {int x; double y;}); +typedef struct {int x; double y;} (*F4)(void); void test_function_pointer_signature() { // Test argument count validation @@ -68,8 +69,6 @@ void test_function_pointer_signature() { // // Test argument type validation - should require function pointer (void)__builtin_wasm_test_function_pointer_signature((void*)0); // expected-error {{used type 'void *' where function pointer is required}} (void)__builtin_wasm_test_function_pointer_signature((int)0); // expected-error {{used type 'int' where function pointer is required}} - (void)__builtin_wasm_test_function_pointer_signature((F3)0); // expected-error {{not supported for function pointers with a reference type parameter}} - (void)__builtin_wasm_test_function_pointer_signature((F4)0); // expected-error {{not supported for function pointers with a reference type return value}} // // Test valid usage int res = __builtin_wasm_test_function_pointer_signature((F1)0); @@ -77,4 +76,14 @@ void test_function_pointer_signature() { // Test return type _Static_assert(EXPR_HAS_TYPE(__builtin_wasm_test_function_pointer_signature((F1)0), int), ""); + +#ifdef MULTIVALUE + // Test that struct arguments and returns are rejected with multivalue abi + (void)__builtin_wasm_test_function_pointer_signature((F3)0); // expected-error {{not supported with the multivalue ABI for function pointers with a struct/union as parameter}} + (void)__builtin_wasm_test_function_pointer_signature((F4)0); // expected-error {{not supported with the multivalue ABI for function pointers with a struct/union as return value}} +#else + // with default abi they are fine + (void)__builtin_wasm_test_function_pointer_signature((F3)0); + (void)__builtin_wasm_test_function_pointer_signature((F4)0); +#endif } diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index d7c1db7..a444d82 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -264,7 +264,7 @@ END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM)) NO_EXEC_STACK_DIRECTIVE -// GNU property note for BTI and PAC -GNU_PROPERTY_BTI_PAC +// GNU property note for BTI, PAC, and GCS +GNU_PROPERTY_BTI_PAC_GCS #endif // defined(__aarch64__) || defined(__arm64ec__) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S index 7c47336..d5510ac 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-abi.S +++ 
b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -371,5 +371,5 @@ END_COMPILERRT_FUNCTION(__arm_sme_restore) NO_EXEC_STACK_DIRECTIVE -// GNU property note for BTI and PAC -GNU_PROPERTY_BTI_PAC +// GNU property note for BTI, PAC, and GCS +GNU_PROPERTY_BTI_PAC_GCS diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h index 89372f1..d7db7d8 100644 --- a/compiler-rt/lib/builtins/assembly.h +++ b/compiler-rt/lib/builtins/assembly.h @@ -79,11 +79,12 @@ #define FUNC_ALIGN #endif -// BTI and PAC gnu property note +// BTI, PAC, and GCS gnu property note #define NT_GNU_PROPERTY_TYPE_0 5 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2 +#define GNU_PROPERTY_AARCH64_FEATURE_1_GCS 4 #if defined(__ARM_FEATURE_BTI_DEFAULT) #define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI @@ -97,6 +98,12 @@ #define PAC_FLAG 0 #endif +#if defined(__ARM_FEATURE_GCS_DEFAULT) +#define GCS_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_GCS +#else +#define GCS_FLAG 0 +#endif + #define GNU_PROPERTY(type, value) \ .pushsection .note.gnu.property, "a" SEPARATOR \ .p2align 3 SEPARATOR \ @@ -118,11 +125,12 @@ #define BTI_J #endif -#if (BTI_FLAG | PAC_FLAG) != 0 -#define GNU_PROPERTY_BTI_PAC \ - GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG) +#if (BTI_FLAG | PAC_FLAG | GCS_FLAG) != 0 +#define GNU_PROPERTY_BTI_PAC_GCS \ + GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, \ + BTI_FLAG | PAC_FLAG | GCS_FLAG) #else -#define GNU_PROPERTY_BTI_PAC +#define GNU_PROPERTY_BTI_PAC_GCS #endif #if defined(__clang__) || defined(__GCC_HAVE_DWARF2_CFI_ASM) diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S index fd20825e..825f411 100644 --- a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S +++ b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S @@ -11,4 +11,4 @@ NO_EXEC_STACK_DIRECTIVE -GNU_PROPERTY_BTI_PAC +GNU_PROPERTY_BTI_PAC_GCS diff --git a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S index 0c0abb6..b8d98b0 100644 --- a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S +++ b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S @@ -99,4 +99,4 @@ ASM_TRAMPOLINE_ALIAS(_setjmp, setjmp) // We do not need executable stack. NO_EXEC_STACK_DIRECTIVE -GNU_PROPERTY_BTI_PAC +GNU_PROPERTY_BTI_PAC_GCS diff --git a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S index fd060c5..be82475 100644 --- a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S +++ b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S @@ -157,4 +157,4 @@ mismatch: // We do not need executable stack. 
NO_EXEC_STACK_DIRECTIVE -GNU_PROPERTY_BTI_PAC +GNU_PROPERTY_BTI_PAC_GCS diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S index cdfa6f1..5066953 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S @@ -43,6 +43,6 @@ ASM_SIZE(vfork) ASM_INTERCEPTOR_TRAMPOLINE(vfork) ASM_TRAMPOLINE_ALIAS(vfork, vfork) -GNU_PROPERTY_BTI_PAC +GNU_PROPERTY_BTI_PAC_GCS #endif diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S index 7d920be..f1d11a3 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S @@ -222,6 +222,6 @@ ASM_SIZE(ASM_SYMBOL_INTERCEPTOR(__sigsetjmp)) NO_EXEC_STACK_DIRECTIVE -GNU_PROPERTY_BTI_PAC +GNU_PROPERTY_BTI_PAC_GCS #endif diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 332c087..dc2db1d 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -251,19 +251,33 @@ else() add_win_flangrt_runtime(STATIC dynamic MultiThreadedDLL INSTALL_WITH_TOOLCHAIN) add_win_flangrt_runtime(STATIC dynamic_dbg MultiThreadedDebugDLL INSTALL_WITH_TOOLCHAIN) - # Unittests link against LLVMSupport which is using CMake's default runtime - # library selection, which is either MultiThreadedDLL or MultiThreadedDebugDLL - # depending on the configuration. They have to match or linking will fail. + # Unittests link against LLVMSupport. If CMAKE_MSVC_RUNTIME_LIBRARY is set, + # that will have been used for LLVMSupport so it must also be used here. + # Otherwise this will use CMake's default runtime library selection, which + # is either MultiThreadedDLL or MultiThreadedDebugDLL depending on the configuration. + # They have to match or linking will fail. if (GENERATOR_IS_MULTI_CONFIG) # We cannot select an ALIAS library because it may be different # per configuration. Fallback to CMake's default. add_win_flangrt_runtime(STATIC unittest "" EXCLUDE_FROM_ALL) else () - string(TOLOWER ${CMAKE_BUILD_TYPE} build_type) - if (build_type STREQUAL "debug") - add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg) - else () - add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic) - endif () + # Check if CMAKE_MSVC_RUNTIME_LIBRARY was set. + if (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreaded") + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static) + elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDLL") + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic) + elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebug") + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static_dbg) + elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebugDLL") + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg) + else() + # Default based on the build type. 
+ string(TOLOWER ${CMAKE_BUILD_TYPE} build_type) + if (build_type STREQUAL "debug") + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg) + else () + add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic) + endif () + endif() endif () endif() diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt index 831bc8a..fd63ad1 100644 --- a/flang-rt/unittests/CMakeLists.txt +++ b/flang-rt/unittests/CMakeLists.txt @@ -94,14 +94,6 @@ function(add_flangrt_unittest test_dirname) target_link_libraries(${test_dirname} PRIVATE ${ARG_LINK_LIBS}) add_flangrt_unittest_offload_properties(${test_dirname}) add_flangrt_dependent_libs(${test_dirname}) - - # Required because LLVMSupport is compiled with this option. - # FIXME: According to CMake documentation, this is the default. Why is it - # needed? LLVM's add_unittest doesn't set it either. - set_target_properties(${test_dirname} - PROPERTIES - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL" - ) endfunction() function(add_flangrt_nongtest_unittest test_name) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 8302e40..e72190f 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4986,9 +4986,9 @@ struct OmpEndCriticalDirective { CharBlock source; std::tuple<Verbatim, std::optional<Name>> t; }; -struct OpenMPCriticalConstruct { - TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct); - std::tuple<OmpCriticalDirective, Block, OmpEndCriticalDirective> t; + +struct OpenMPCriticalConstruct : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct, OmpBlockConstruct); }; // 2.11.3 allocate -> ALLOCATE [(variable-name-list)] [clause] diff --git a/flang/lib/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index b8ad9ed..b8ad9ed 100644 --- a/flang/lib/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index db6a0e2..4ce9a0e 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -34,6 +34,7 @@ #include "flang/Parser/openmp-utils.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-directive-sets.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/tools.h" #include "flang/Support/Flags.h" #include "flang/Support/OpenMP-utils.h" @@ -3820,18 +3821,29 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPCriticalConstruct &criticalConstruct) { - const auto &cd = std::get<parser::OmpCriticalDirective>(criticalConstruct.t); - List<Clause> clauses = - makeClauses(std::get<parser::OmpClauseList>(cd.t), semaCtx); + const parser::OmpDirectiveSpecification &beginSpec = + criticalConstruct.BeginDir(); + List<Clause> clauses = makeClauses(beginSpec.Clauses(), semaCtx); ConstructQueue queue{buildConstructQueue( - converter.getFirOpBuilder().getModule(), semaCtx, eval, cd.source, + converter.getFirOpBuilder().getModule(), semaCtx, eval, beginSpec.source, llvm::omp::Directive::OMPD_critical, clauses)}; - const auto &name = std::get<std::optional<parser::Name>>(cd.t); + std::optional<parser::Name> critName; + const parser::OmpArgumentList &args = beginSpec.Arguments(); + if (!args.v.empty()) { + // All of these things should be guaranteed to exist after semantic 
checks. + auto *object = parser::Unwrap<parser::OmpObject>(args.v.front()); + assert(object && "Expecting object as argument"); + auto *designator = semantics::omp::GetDesignatorFromObj(*object); + assert(designator && "Expecting designator in argument"); + auto *name = semantics::getDesignatorNameIfDataRef(*designator); + assert(name && "Expecting dataref in designator"); + critName = *name; + } mlir::Location currentLocation = converter.getCurrentLocation(); genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, queue, - queue.begin(), name); + queue.begin(), critName); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 84d1e81..46b1486 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1758,17 +1758,8 @@ TYPE_PARSER(sourced(construct<OpenMPDeclareMapperConstruct>( TYPE_PARSER(construct<OmpReductionCombiner>(Parser<AssignmentStmt>{}) || construct<OmpReductionCombiner>(Parser<FunctionReference>{})) -// 2.13.2 OMP CRITICAL -TYPE_PARSER(startOmpLine >> - sourced(construct<OmpEndCriticalDirective>( - verbatim("END CRITICAL"_tok), maybe(parenthesized(name)))) / - endOmpLine) -TYPE_PARSER(sourced(construct<OmpCriticalDirective>(verbatim("CRITICAL"_tok), - maybe(parenthesized(name)), Parser<OmpClauseList>{})) / - endOmpLine) - TYPE_PARSER(construct<OpenMPCriticalConstruct>( - Parser<OmpCriticalDirective>{}, block, Parser<OmpEndCriticalDirective>{})) + OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical})) // 2.11.3 Executable Allocate directive TYPE_PARSER( diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 46141e2..4f8d498 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2606,9 +2606,7 @@ public: EndOpenMP(); } void Unparse(const OpenMPCriticalConstruct &x) { - Walk(std::get<OmpCriticalDirective>(x.t)); - Walk(std::get<Block>(x.t), ""); - Walk(std::get<OmpEndCriticalDirective>(x.t)); + Unparse(static_cast<const OmpBlockConstruct &>(x)); } void Unparse(const OmpDeclareTargetWithList &x) { Put("("), Walk(x.v), Put(")"); } diff --git a/flang/lib/Semantics/check-omp-atomic.cpp b/flang/lib/Semantics/check-omp-atomic.cpp index a5fdabf..fcb0f9a 100644 --- a/flang/lib/Semantics/check-omp-atomic.cpp +++ b/flang/lib/Semantics/check-omp-atomic.cpp @@ -11,13 +11,13 @@ //===----------------------------------------------------------------------===// #include "check-omp-structure.h" -#include "openmp-utils.h" #include "flang/Common/indirection.h" #include "flang/Evaluate/expression.h" #include "flang/Evaluate/tools.h" #include "flang/Parser/char-block.h" #include "flang/Parser/parse-tree.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp index 59d57a2..8dad1f5 100644 --- a/flang/lib/Semantics/check-omp-loop.cpp +++ b/flang/lib/Semantics/check-omp-loop.cpp @@ -13,7 +13,6 @@ #include "check-omp-structure.h" #include "check-directive-structure.h" -#include "openmp-utils.h" #include "flang/Common/idioms.h" #include "flang/Common/visit.h" @@ -23,6 +22,7 @@ #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" #include "flang/Semantics/openmp-modifiers.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/semantics.h" #include
"flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" diff --git a/flang/lib/Semantics/check-omp-metadirective.cpp b/flang/lib/Semantics/check-omp-metadirective.cpp index 03487da..cf5ea90 100644 --- a/flang/lib/Semantics/check-omp-metadirective.cpp +++ b/flang/lib/Semantics/check-omp-metadirective.cpp @@ -12,8 +12,6 @@ #include "check-omp-structure.h" -#include "openmp-utils.h" - #include "flang/Common/idioms.h" #include "flang/Common/indirection.h" #include "flang/Common/visit.h" @@ -21,6 +19,7 @@ #include "flang/Parser/message.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-modifiers.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/tools.h" #include "llvm/Frontend/OpenMP/OMP.h" diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index a9c56c3..cbe6b2c 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -10,7 +10,6 @@ #include "check-directive-structure.h" #include "definable.h" -#include "openmp-utils.h" #include "resolve-names-utils.h" #include "flang/Common/idioms.h" @@ -27,6 +26,7 @@ #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/openmp-modifiers.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/semantics.h" #include "flang/Semantics/symbol.h" @@ -537,14 +537,6 @@ template <typename Checker> struct DirectiveSpellingVisitor { checker_(x.v.source, Directive::OMPD_assume); return false; } - bool Pre(const parser::OmpCriticalDirective &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_critical); - return false; - } - bool Pre(const parser::OmpEndCriticalDirective &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_critical); - return false; - } bool Pre(const parser::OmpMetadirectiveDirective &x) { checker_( std::get<parser::Verbatim>(x.t).source, Directive::OMPD_metadirective); @@ -2034,41 +2026,87 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) { } void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) { - const auto &dir{std::get<parser::OmpCriticalDirective>(x.t)}; - const auto &dirSource{std::get<parser::Verbatim>(dir.t).source}; - const auto &endDir{std::get<parser::OmpEndCriticalDirective>(x.t)}; - PushContextAndClauseSets(dirSource, llvm::omp::Directive::OMPD_critical); + const parser::OmpBeginDirective &beginSpec{x.BeginDir()}; + const std::optional<parser::OmpEndDirective> &endSpec{x.EndDir()}; + PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirName().v); + const auto &block{std::get<parser::Block>(x.t)}; - CheckNoBranching(block, llvm::omp::Directive::OMPD_critical, dir.source); - const auto &dirName{std::get<std::optional<parser::Name>>(dir.t)}; - const auto &endDirName{std::get<std::optional<parser::Name>>(endDir.t)}; - const auto &ompClause{std::get<parser::OmpClauseList>(dir.t)}; - if (dirName && endDirName && - dirName->ToString().compare(endDirName->ToString())) { - context_ - .Say(endDirName->source, - parser::MessageFormattedText{ - "CRITICAL directive names do not match"_err_en_US}) - .Attach(dirName->source, "should be "_en_US); - } else if (dirName && !endDirName) { - context_ - .Say(dirName->source, - parser::MessageFormattedText{ - "CRITICAL directive names do not match"_err_en_US}) - .Attach(dirName->source, "should be NULL"_en_US); - } else if (!dirName && endDirName) { - 
context_ - .Say(endDirName->source, - parser::MessageFormattedText{ - "CRITICAL directive names do not match"_err_en_US}) - .Attach(endDirName->source, "should be NULL"_en_US); - } - if (!dirName && !ompClause.source.empty() && - ompClause.source.NULTerminatedToString() != "hint(omp_sync_hint_none)") { - context_.Say(dir.source, - parser::MessageFormattedText{ - "Hint clause other than omp_sync_hint_none cannot be specified for " - "an unnamed CRITICAL directive"_err_en_US}); + CheckNoBranching( + block, llvm::omp::Directive::OMPD_critical, beginSpec.DirName().source); + + auto getNameFromArg{[](const parser::OmpArgument &arg) { + if (auto *object{parser::Unwrap<parser::OmpObject>(arg.u)}) { + if (auto *designator{omp::GetDesignatorFromObj(*object)}) { + return getDesignatorNameIfDataRef(*designator); + } + } + return static_cast<const parser::Name *>(nullptr); + }}; + + auto checkArgumentList{[&](const parser::OmpArgumentList &args) { + if (args.v.size() > 1) { + context_.Say(args.source, + "Only a single argument is allowed in CRITICAL directive"_err_en_US); + } else if (!args.v.empty()) { + if (!getNameFromArg(args.v.front())) { + context_.Say(args.v.front().source, + "CRITICAL argument should be a name"_err_en_US); + } + } + }}; + + const parser::Name *beginName{nullptr}; + const parser::Name *endName{nullptr}; + + auto &beginArgs{beginSpec.Arguments()}; + checkArgumentList(beginArgs); + + if (!beginArgs.v.empty()) { + beginName = getNameFromArg(beginArgs.v.front()); + } + + if (endSpec) { + auto &endArgs{endSpec->Arguments()}; + checkArgumentList(endArgs); + + if (beginArgs.v.empty() != endArgs.v.empty()) { + parser::CharBlock source{ + beginArgs.v.empty() ? endArgs.source : beginArgs.source}; + context_.Say(source, + "Either both CRITICAL and END CRITICAL should have an argument, or none of them should"_err_en_US); + } else if (!beginArgs.v.empty()) { + endName = getNameFromArg(endArgs.v.front()); + if (beginName && endName) { + if (beginName->ToString() != endName->ToString()) { + context_.Say(endName->source, + "The names on CRITICAL and END CRITICAL must match"_err_en_US); + } + } + } + } + + for (auto &clause : beginSpec.Clauses().v) { + auto *hint{std::get_if<parser::OmpClause::Hint>(&clause.u)}; + if (!hint) { + continue; + } + const int64_t OmpSyncHintNone = 0; // omp_sync_hint_none + std::optional<int64_t> hintValue{GetIntValue(hint->v.v)}; + if (hintValue && *hintValue != OmpSyncHintNone) { + // Emit a diagnostic if the name is missing, and point to the directive + // with a missing name. 
+ parser::CharBlock source; + if (!beginName) { + source = beginSpec.DirName().source; + } else if (endSpec && !endName) { + source = endSpec->DirName().source; + } + + if (!source.empty()) { + context_.Say(source, + "When HINT other than 'omp_sync_hint_none' is present, CRITICAL directive should have a name"_err_en_US); + } + } } } diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index 7a492a4..e8df346c 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "openmp-utils.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Common/indirection.h" #include "flang/Common/reference.h" diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 0557b08..fe0d2a7 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -10,7 +10,6 @@ #include "check-acc-structure.h" #include "check-omp-structure.h" -#include "openmp-utils.h" #include "resolve-names-utils.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/fold.h" @@ -22,6 +21,7 @@ #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-dsa.h" #include "flang/Semantics/openmp-modifiers.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Support/Flags.h" @@ -876,6 +876,9 @@ private: bool IsNestedInDirective(llvm::omp::Directive directive); void ResolveOmpObjectList(const parser::OmpObjectList &, Symbol::Flag); + void ResolveOmpDesignator( + const parser::Designator &designator, Symbol::Flag ompFlag); + void ResolveOmpCommonBlock(const parser::Name &name, Symbol::Flag ompFlag); void ResolveOmpObject(const parser::OmpObject &, Symbol::Flag); Symbol *ResolveOmp(const parser::Name &, Symbol::Flag, Scope &); Symbol *ResolveOmp(Symbol &, Symbol::Flag, Scope &); @@ -2139,8 +2142,8 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) { } bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) { - const auto &beginCriticalDir{std::get<parser::OmpCriticalDirective>(x.t)}; - PushContext(beginCriticalDir.source, llvm::omp::Directive::OMPD_critical); + const parser::OmpBeginDirective &beginSpec{x.BeginDir()}; + PushContext(beginSpec.DirName().source, beginSpec.DirName().v); GetContext().withinConstruct = true; return true; } @@ -2786,196 +2789,182 @@ static bool SymbolOrEquivalentIsInNamelist(const Symbol &symbol) { }); } -void OmpAttributeVisitor::ResolveOmpObject( - const parser::OmpObject &ompObject, Symbol::Flag ompFlag) { +void OmpAttributeVisitor::ResolveOmpDesignator( + const parser::Designator &designator, Symbol::Flag ompFlag) { unsigned version{context_.langOptions().OpenMPVersion}; - common::visit( - common::visitors{ - [&](const parser::Designator &designator) { - if (const auto *name{ - semantics::getDesignatorNameIfDataRef(designator)}) { - if (auto *symbol{ResolveOmp(*name, ompFlag, currScope())}) { - auto checkExclusivelists = - [&](const Symbol *symbol1, Symbol::Flag firstOmpFlag, - const Symbol *symbol2, Symbol::Flag secondOmpFlag) { - if ((symbol1->test(firstOmpFlag) && - symbol2->test(secondOmpFlag)) || - (symbol1->test(secondOmpFlag) && - symbol2->test(firstOmpFlag))) { - context_.Say(designator.source, - "Variable '%s' may not " - "appear on both %s and %s " - "clauses on a %s construct"_err_en_US, - 
symbol2->name(), - Symbol::OmpFlagToClauseName(firstOmpFlag), - Symbol::OmpFlagToClauseName(secondOmpFlag), - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName( - GetContext().directive, version) - .str())); - } - }; - if (dataCopyingAttributeFlags.test(ompFlag)) { - CheckDataCopyingClause(*name, *symbol, ompFlag); - } else { - AddToContextObjectWithExplicitDSA(*symbol, ompFlag); - if (dataSharingAttributeFlags.test(ompFlag)) { - CheckMultipleAppearances(*name, *symbol, ompFlag); - } - if (privateDataSharingAttributeFlags.test(ompFlag)) { - CheckObjectIsPrivatizable(*name, *symbol, ompFlag); - } + llvm::omp::Directive directive{GetContext().directive}; - if (ompFlag == Symbol::Flag::OmpAllocate) { - AddAllocateName(name); - } - } - if (ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective && - IsAllocatable(*symbol) && - !IsNestedInDirective(llvm::omp::Directive::OMPD_allocate)) { - context_.Say(designator.source, - "List items specified in the ALLOCATE directive must not " - "have the ALLOCATABLE attribute unless the directive is " - "associated with an ALLOCATE statement"_err_en_US); - } - if ((ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective || - ompFlag == - Symbol::Flag::OmpExecutableAllocateDirective) && - ResolveOmpObjectScope(name) == nullptr) { - context_.Say(designator.source, // 2.15.3 - "List items must be declared in the same scoping unit " - "in which the %s directive appears"_err_en_US, - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName( - GetContext().directive, version) - .str())); - } - if (ompFlag == Symbol::Flag::OmpReduction) { - // Using variables inside of a namelist in OpenMP reductions - // is allowed by the standard, but is not allowed for - // privatisation. This looks like an oversight. If the - // namelist is hoisted to a global, we cannot apply the - // mapping for the reduction variable: resulting in incorrect - // results. Disabling this hoisting could make some real - // production code go slower. 
See discussion in #109303 - if (SymbolOrEquivalentIsInNamelist(*symbol)) { - context_.Say(name->source, - "Variable '%s' in NAMELIST cannot be in a REDUCTION clause"_err_en_US, - name->ToString()); - } - } - if (ompFlag == Symbol::Flag::OmpInclusiveScan || - ompFlag == Symbol::Flag::OmpExclusiveScan) { - if (!symbol->test(Symbol::Flag::OmpInScanReduction)) { - context_.Say(name->source, - "List item %s must appear in REDUCTION clause " - "with the INSCAN modifier of the parent " - "directive"_err_en_US, - name->ToString()); - } - } - if (ompFlag == Symbol::Flag::OmpDeclareTarget) { - if (symbol->IsFuncResult()) { - if (Symbol * func{currScope().symbol()}) { - CHECK(func->IsSubprogram()); - func->set(ompFlag); - name->symbol = func; - } - } - } - if (GetContext().directive == - llvm::omp::Directive::OMPD_target_data) { - checkExclusivelists(symbol, Symbol::Flag::OmpUseDevicePtr, - symbol, Symbol::Flag::OmpUseDeviceAddr); - } - if (llvm::omp::allDistributeSet.test(GetContext().directive)) { - checkExclusivelists(symbol, Symbol::Flag::OmpFirstPrivate, - symbol, Symbol::Flag::OmpLastPrivate); - } - if (llvm::omp::allTargetSet.test(GetContext().directive)) { - checkExclusivelists(symbol, Symbol::Flag::OmpIsDevicePtr, - symbol, Symbol::Flag::OmpHasDeviceAddr); - const auto *hostAssocSym{symbol}; - if (!(symbol->test(Symbol::Flag::OmpIsDevicePtr) || - symbol->test(Symbol::Flag::OmpHasDeviceAddr))) { - if (const auto *details{ - symbol->detailsIf<HostAssocDetails>()}) { - hostAssocSym = &details->symbol(); - } - } - Symbol::Flag dataMappingAttributeFlags[] = { - Symbol::Flag::OmpMapTo, Symbol::Flag::OmpMapFrom, - Symbol::Flag::OmpMapToFrom, Symbol::Flag::OmpMapStorage, - Symbol::Flag::OmpMapDelete, Symbol::Flag::OmpIsDevicePtr, - Symbol::Flag::OmpHasDeviceAddr}; - - Symbol::Flag dataSharingAttributeFlags[] = { - Symbol::Flag::OmpPrivate, Symbol::Flag::OmpFirstPrivate, - Symbol::Flag::OmpLastPrivate, Symbol::Flag::OmpShared, - Symbol::Flag::OmpLinear}; - - // For OMP TARGET TEAMS directive some sharing attribute - // flags and mapping attribute flags can co-exist. 
- if (!(llvm::omp::allTeamsSet.test(GetContext().directive) || - llvm::omp::allParallelSet.test( - GetContext().directive))) { - for (Symbol::Flag ompFlag1 : dataMappingAttributeFlags) { - for (Symbol::Flag ompFlag2 : dataSharingAttributeFlags) { - if ((hostAssocSym->test(ompFlag2) && - hostAssocSym->test( - Symbol::Flag::OmpExplicit)) || - (symbol->test(ompFlag2) && - symbol->test(Symbol::Flag::OmpExplicit))) { - checkExclusivelists( - hostAssocSym, ompFlag1, symbol, ompFlag2); - } - } - } - } - } - } - } else { - // Array sections to be changed to substrings as needed - if (AnalyzeExpr(context_, designator)) { - if (std::holds_alternative<parser::Substring>(designator.u)) { - context_.Say(designator.source, - "Substrings are not allowed on OpenMP " - "directives or clauses"_err_en_US); - } - } - // other checks, more TBD - } - }, - [&](const parser::Name &name) { // common block - if (auto *symbol{ResolveOmpCommonBlockName(&name)}) { - if (!dataCopyingAttributeFlags.test(ompFlag)) { - CheckMultipleAppearances( - name, *symbol, Symbol::Flag::OmpCommonBlock); - } - // 2.15.3 When a named common block appears in a list, it has the - // same meaning as if every explicit member of the common block - // appeared in the list - auto &details{symbol->get<CommonBlockDetails>()}; - unsigned index{0}; - for (auto &object : details.objects()) { - if (auto *resolvedObject{ - ResolveOmp(*object, ompFlag, currScope())}) { - if (dataCopyingAttributeFlags.test(ompFlag)) { - CheckDataCopyingClause(name, *resolvedObject, ompFlag); - } else { - AddToContextObjectWithExplicitDSA(*resolvedObject, ompFlag); - } - details.replace_object(*resolvedObject, index); - } - index++; - } - } else { - context_.Say(name.source, // 2.15.3 - "COMMON block must be declared in the same scoping unit " - "in which the OpenMP directive or clause appears"_err_en_US); + const auto *name{semantics::getDesignatorNameIfDataRef(designator)}; + if (!name) { + // Array sections to be changed to substrings as needed + if (AnalyzeExpr(context_, designator)) { + if (std::holds_alternative<parser::Substring>(designator.u)) { + context_.Say(designator.source, + "Substrings are not allowed on OpenMP directives or clauses"_err_en_US); + } + } + // other checks, more TBD + return; + } + + if (auto *symbol{ResolveOmp(*name, ompFlag, currScope())}) { + auto checkExclusivelists{// + [&](const Symbol *symbol1, Symbol::Flag firstOmpFlag, + const Symbol *symbol2, Symbol::Flag secondOmpFlag) { + if ((symbol1->test(firstOmpFlag) && symbol2->test(secondOmpFlag)) || + (symbol1->test(secondOmpFlag) && symbol2->test(firstOmpFlag))) { + context_.Say(designator.source, + "Variable '%s' may not appear on both %s and %s clauses on a %s construct"_err_en_US, + symbol2->name(), Symbol::OmpFlagToClauseName(firstOmpFlag), + Symbol::OmpFlagToClauseName(secondOmpFlag), + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(directive, version))); + } + }}; + if (dataCopyingAttributeFlags.test(ompFlag)) { + CheckDataCopyingClause(*name, *symbol, ompFlag); + } else { + AddToContextObjectWithExplicitDSA(*symbol, ompFlag); + if (dataSharingAttributeFlags.test(ompFlag)) { + CheckMultipleAppearances(*name, *symbol, ompFlag); + } + if (privateDataSharingAttributeFlags.test(ompFlag)) { + CheckObjectIsPrivatizable(*name, *symbol, ompFlag); + } + + if (ompFlag == Symbol::Flag::OmpAllocate) { + AddAllocateName(name); + } + } + if (ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective && + IsAllocatable(*symbol) && + 
!IsNestedInDirective(llvm::omp::Directive::OMPD_allocate)) { + context_.Say(designator.source, + "List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US); + } + if ((ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective || + ompFlag == Symbol::Flag::OmpExecutableAllocateDirective) && + ResolveOmpObjectScope(name) == nullptr) { + context_.Say(designator.source, // 2.15.3 + "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US, + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(directive, version))); + } + if (ompFlag == Symbol::Flag::OmpReduction) { + // Using variables inside of a namelist in OpenMP reductions + // is allowed by the standard, but is not allowed for + // privatisation. This looks like an oversight. If the + // namelist is hoisted to a global, we cannot apply the + // mapping for the reduction variable: resulting in incorrect + // results. Disabling this hoisting could make some real + // production code go slower. See discussion in #109303 + if (SymbolOrEquivalentIsInNamelist(*symbol)) { + context_.Say(name->source, + "Variable '%s' in NAMELIST cannot be in a REDUCTION clause"_err_en_US, + name->ToString()); + } + } + if (ompFlag == Symbol::Flag::OmpInclusiveScan || + ompFlag == Symbol::Flag::OmpExclusiveScan) { + if (!symbol->test(Symbol::Flag::OmpInScanReduction)) { + context_.Say(name->source, + "List item %s must appear in REDUCTION clause with the INSCAN modifier of the parent directive"_err_en_US, + name->ToString()); + } + } + if (ompFlag == Symbol::Flag::OmpDeclareTarget) { + if (symbol->IsFuncResult()) { + if (Symbol * func{currScope().symbol()}) { + CHECK(func->IsSubprogram()); + func->set(ompFlag); + name->symbol = func; + } + } + } + if (directive == llvm::omp::Directive::OMPD_target_data) { + checkExclusivelists(symbol, Symbol::Flag::OmpUseDevicePtr, symbol, + Symbol::Flag::OmpUseDeviceAddr); + } + if (llvm::omp::allDistributeSet.test(directive)) { + checkExclusivelists(symbol, Symbol::Flag::OmpFirstPrivate, symbol, + Symbol::Flag::OmpLastPrivate); + } + if (llvm::omp::allTargetSet.test(directive)) { + checkExclusivelists(symbol, Symbol::Flag::OmpIsDevicePtr, symbol, + Symbol::Flag::OmpHasDeviceAddr); + const auto *hostAssocSym{symbol}; + if (!symbol->test(Symbol::Flag::OmpIsDevicePtr) && + !symbol->test(Symbol::Flag::OmpHasDeviceAddr)) { + if (const auto *details{symbol->detailsIf<HostAssocDetails>()}) { + hostAssocSym = &details->symbol(); + } + } + static Symbol::Flag dataMappingAttributeFlags[] = {// + Symbol::Flag::OmpMapTo, Symbol::Flag::OmpMapFrom, + Symbol::Flag::OmpMapToFrom, Symbol::Flag::OmpMapStorage, + Symbol::Flag::OmpMapDelete, Symbol::Flag::OmpIsDevicePtr, + Symbol::Flag::OmpHasDeviceAddr}; + + static Symbol::Flag dataSharingAttributeFlags[] = {// + Symbol::Flag::OmpPrivate, Symbol::Flag::OmpFirstPrivate, + Symbol::Flag::OmpLastPrivate, Symbol::Flag::OmpShared, + Symbol::Flag::OmpLinear}; + + // For OMP TARGET TEAMS directive some sharing attribute + // flags and mapping attribute flags can co-exist. 
+ if (!llvm::omp::allTeamsSet.test(directive) && + !llvm::omp::allParallelSet.test(directive)) { + for (Symbol::Flag ompFlag1 : dataMappingAttributeFlags) { + for (Symbol::Flag ompFlag2 : dataSharingAttributeFlags) { + if ((hostAssocSym->test(ompFlag2) && + hostAssocSym->test(Symbol::Flag::OmpExplicit)) || + (symbol->test(ompFlag2) && + symbol->test(Symbol::Flag::OmpExplicit))) { + checkExclusivelists(hostAssocSym, ompFlag1, symbol, ompFlag2); } - }, - }, + } + } + } + } + } +} + +void OmpAttributeVisitor::ResolveOmpCommonBlock( + const parser::Name &name, Symbol::Flag ompFlag) { + if (auto *symbol{ResolveOmpCommonBlockName(&name)}) { + if (!dataCopyingAttributeFlags.test(ompFlag)) { + CheckMultipleAppearances(name, *symbol, Symbol::Flag::OmpCommonBlock); + } + // 2.15.3 When a named common block appears in a list, it has the + // same meaning as if every explicit member of the common block + // appeared in the list + auto &details{symbol->get<CommonBlockDetails>()}; + for (auto [index, object] : llvm::enumerate(details.objects())) { + if (auto *resolvedObject{ResolveOmp(*object, ompFlag, currScope())}) { + if (dataCopyingAttributeFlags.test(ompFlag)) { + CheckDataCopyingClause(name, *resolvedObject, ompFlag); + } else { + AddToContextObjectWithExplicitDSA(*resolvedObject, ompFlag); + } + details.replace_object(*resolvedObject, index); + } + } + } else { + context_.Say(name.source, // 2.15.3 + "COMMON block must be declared in the same scoping unit in which the OpenMP directive or clause appears"_err_en_US); + } +} + +void OmpAttributeVisitor::ResolveOmpObject( + const parser::OmpObject &ompObject, Symbol::Flag ompFlag) { + common::visit(common::visitors{ + [&](const parser::Designator &designator) { + ResolveOmpDesignator(designator, ompFlag); + }, + [&](const parser::Name &name) { // common block + ResolveOmpCommonBlock(name, ompFlag); + }, + }, ompObject.u); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 66a45dd..5808b4b 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -30,6 +30,7 @@ #include "flang/Semantics/attr.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-modifiers.h" +#include "flang/Semantics/openmp-utils.h" #include "flang/Semantics/program-tree.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/semantics.h" @@ -1486,6 +1487,16 @@ public: void Post(const parser::OpenMPBlockConstruct &); bool Pre(const parser::OmpBeginDirective &x) { AddOmpSourceRange(x.source); + // Manually resolve names in CRITICAL directives. This is because these + // names do not denote Fortran objects, and the CRITICAL directive causes + // them to be "auto-declared", i.e. inserted into the global scope. + // More specifically, they are not expected to have explicit declarations, + // and if they do the behavior is unspecified. + if (x.DirName().v == llvm::omp::Directive::OMPD_critical) { + for (const parser::OmpArgument &arg : x.Arguments().v) { + ResolveCriticalName(arg); + } + } return true; } void Post(const parser::OmpBeginDirective &) { @@ -1493,6 +1504,12 @@ } bool Pre(const parser::OmpEndDirective &x) { AddOmpSourceRange(x.source); + // Manually resolve names in CRITICAL directives.
+ if (x.DirName().v == llvm::omp::Directive::OMPD_critical) { + for (const parser::OmpArgument &arg : x.Arguments().v) { + ResolveCriticalName(arg); + } + } return true; } void Post(const parser::OmpEndDirective &) { @@ -1591,32 +1608,6 @@ public: void Post(const parser::OmpEndSectionsDirective &) { messageHandler().set_currStmtSource(std::nullopt); } - bool Pre(const parser::OmpCriticalDirective &x) { - AddOmpSourceRange(x.source); - // Manually resolve names in CRITICAL directives. This is because these - // names do not denote Fortran objects, and the CRITICAL directive causes - // them to be "auto-declared", i.e. inserted into the global scope. - // More specifically, they are not expected to have explicit declarations, - // and if they do the behavior is unspeficied. - if (auto &maybeName{std::get<std::optional<parser::Name>>(x.t)}) { - ResolveCriticalName(*maybeName); - } - return true; - } - void Post(const parser::OmpCriticalDirective &) { - messageHandler().set_currStmtSource(std::nullopt); - } - bool Pre(const parser::OmpEndCriticalDirective &x) { - AddOmpSourceRange(x.source); - // Manually resolve names in CRITICAL directives. - if (auto &maybeName{std::get<std::optional<parser::Name>>(x.t)}) { - ResolveCriticalName(*maybeName); - } - return true; - } - void Post(const parser::OmpEndCriticalDirective &) { - messageHandler().set_currStmtSource(std::nullopt); - } bool Pre(const parser::OpenMPThreadprivate &) { SkipImplicitTyping(true); return true; @@ -1732,7 +1723,7 @@ private: const std::optional<parser::OmpClauseList> &clauses, const T &wholeConstruct); - void ResolveCriticalName(const parser::Name &name); + void ResolveCriticalName(const parser::OmpArgument &arg); int metaLevel_{0}; const parser::OmpMetadirectiveDirective *metaDirective_{nullptr}; @@ -1961,7 +1952,7 @@ void OmpVisitor::ProcessReductionSpecifier( } } -void OmpVisitor::ResolveCriticalName(const parser::Name &name) { +void OmpVisitor::ResolveCriticalName(const parser::OmpArgument &arg) { auto &globalScope{[&]() -> Scope & { for (Scope *s{&currScope()};; s = &s->parent()) { if (s->IsTopLevel()) { @@ -1971,15 +1962,21 @@ void OmpVisitor::ResolveCriticalName(const parser::Name &name) { llvm_unreachable("Cannot find global scope"); }()}; - if (auto *symbol{FindInScope(globalScope, name)}) { - if (!symbol->test(Symbol::Flag::OmpCriticalLock)) { - SayWithDecl(name, *symbol, - "CRITICAL construct name '%s' conflicts with a previous declaration"_warn_en_US, - name.ToString()); + if (auto *object{parser::Unwrap<parser::OmpObject>(arg.u)}) { + if (auto *desg{omp::GetDesignatorFromObj(*object)}) { + if (auto *name{getDesignatorNameIfDataRef(*desg)}) { + if (auto *symbol{FindInScope(globalScope, *name)}) { + if (!symbol->test(Symbol::Flag::OmpCriticalLock)) { + SayWithDecl(*name, *symbol, + "CRITICAL construct name '%s' conflicts with a previous declaration"_warn_en_US, + name->ToString()); + } + } else { + name->symbol = &MakeSymbol(globalScope, name->source, Attrs{}); + name->symbol->set(Symbol::Flag::OmpCriticalLock); + } + } } - } else { - name.symbol = &MakeSymbol(globalScope, name.source, Attrs{}); - name.symbol->set(Symbol::Flag::OmpCriticalLock); } } diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp index 3093e39..41077e0 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -70,20 +70,6 @@ public: currStmt_ = std::nullopt; } - bool Pre(const parser::OmpCriticalDirective &x) { - currStmt_ = x.source; - 
return true; - } - void Post(const parser::OmpCriticalDirective &) { currStmt_ = std::nullopt; } - - bool Pre(const parser::OmpEndCriticalDirective &x) { - currStmt_ = x.source; - return true; - } - void Post(const parser::OmpEndCriticalDirective &) { - currStmt_ = std::nullopt; - } - // Directive arguments can be objects with symbols. bool Pre(const parser::OmpBeginDirective &x) { currStmt_ = x.source; diff --git a/flang/test/Driver/atomic-control-options.f90 b/flang/test/Driver/atomic-control-options.f90 new file mode 100644 index 0000000..cb382f9 --- /dev/null +++ b/flang/test/Driver/atomic-control-options.f90 @@ -0,0 +1,20 @@ +! REQUIRES: amdgpu-registered-target +! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -munsafe-fp-atomics %s -o -|FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s +! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-ignore-denormal-mode %s -o -|FileCheck -check-prefix=IGNORE-DENORMAL-MODE %s +! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-fine-grained-memory %s -o -|FileCheck -check-prefix=FINE-GRAINED-MEMORY %s +! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-remote-memory %s -o -|FileCheck -check-prefix=REMOTE-MEMORY %s +program test + implicit none + integer :: A, threads + threads = 128 + A = 0 + !$omp target parallel num_threads(threads) + !$omp atomic + A = A + 1 + !$omp end target parallel +end program test + +!UNSAFE-FP-ATOMICS: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}} +!IGNORE-DENORMAL-MODE: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}} +!FINE-GRAINED-MEMORY: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}} +!REMOTE-MEMORY: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.no.fine.grained.memory !{{.*}} diff --git a/flang/test/Parser/OpenMP/critical-unparse-with-symbols.f90 b/flang/test/Parser/OpenMP/critical-unparse-with-symbols.f90 index 4d0d93a..e5e7561 100644 --- a/flang/test/Parser/OpenMP/critical-unparse-with-symbols.f90 +++ b/flang/test/Parser/OpenMP/critical-unparse-with-symbols.f90 @@ -13,9 +13,9 @@ end !UNPARSE: implicit none !UNPARSE: !DEF: /f/x ObjectEntity INTEGER(4) !UNPARSE: integer x -!UNPARSE: !$omp critical (c) +!UNPARSE: !$omp critical(c) !UNPARSE: !REF: /f/x !UNPARSE: x = 0 -!UNPARSE: !$omp end critical (c) +!UNPARSE: !$omp end critical(c) !UNPARSE: end subroutine diff --git a/flang/test/Semantics/OpenMP/sync-critical01.f90 b/flang/test/Semantics/OpenMP/sync-critical01.f90 index b597eb1..01cc0ac 100644 --- a/flang/test/Semantics/OpenMP/sync-critical01.f90 +++ b/flang/test/Semantics/OpenMP/sync-critical01.f90 @@ -17,22 +17,22 @@ integer function timer_tick_sec() !$OMP CRITICAL (foo) t = t + 1 - !ERROR: CRITICAL directive names do not match + !ERROR: The names on CRITICAL and END CRITICAL must match !$OMP END CRITICAL (bar) !$OMP CRITICAL (bar) t = t + 1 - !ERROR: CRITICAL directive names do not match + !ERROR: The names on CRITICAL and END CRITICAL must match !$OMP END CRITICAL (foo) - !ERROR: CRITICAL directive names do not match + !ERROR: Either both CRITICAL and END CRITICAL should have an argument, or none of them should !$OMP CRITICAL (bar) t = t + 1 
!$OMP END CRITICAL !$OMP CRITICAL t = t + 1 - !ERROR: CRITICAL directive names do not match + !ERROR: Either both CRITICAL and END CRITICAL should have an argument, or none of them should !$OMP END CRITICAL (foo) timer_tick_sec = t diff --git a/flang/test/Semantics/OpenMP/sync-critical02.f90 b/flang/test/Semantics/OpenMP/sync-critical02.f90 index 1fa9d6a..b77bd66 100644 --- a/flang/test/Semantics/OpenMP/sync-critical02.f90 +++ b/flang/test/Semantics/OpenMP/sync-critical02.f90 @@ -8,7 +8,7 @@ program sample use omp_lib integer i, j - !ERROR: Hint clause other than omp_sync_hint_none cannot be specified for an unnamed CRITICAL directive + !ERROR: When HINT other than 'omp_sync_hint_none' is present, CRITICAL directive should have a name !$omp critical hint(omp_lock_hint_speculative) j = j + 1 !$omp end critical @@ -17,7 +17,7 @@ program sample i = i - 1 !$omp end critical (foo) - !ERROR: Hint clause other than omp_sync_hint_none cannot be specified for an unnamed CRITICAL directive + !ERROR: When HINT other than 'omp_sync_hint_none' is present, CRITICAL directive should have a name !$omp critical hint(omp_lock_hint_nonspeculative) j = j + 1 !$omp end critical @@ -26,7 +26,7 @@ program sample i = i - 1 !$omp end critical (foo) - !ERROR: Hint clause other than omp_sync_hint_none cannot be specified for an unnamed CRITICAL directive + !ERROR: When HINT other than 'omp_sync_hint_none' is present, CRITICAL directive should have a name !$omp critical hint(omp_lock_hint_contended) j = j + 1 !$omp end critical @@ -35,7 +35,7 @@ program sample i = i - 1 !$omp end critical (foo) - !ERROR: Hint clause other than omp_sync_hint_none cannot be specified for an unnamed CRITICAL directive + !ERROR: When HINT other than 'omp_sync_hint_none' is present, CRITICAL directive should have a name !$omp critical hint(omp_lock_hint_uncontended) j = j + 1 !$omp end critical diff --git a/libc/src/dlfcn/CMakeLists.txt b/libc/src/dlfcn/CMakeLists.txt index 8ef0540..2ee3ac0 100644 --- a/libc/src/dlfcn/CMakeLists.txt +++ b/libc/src/dlfcn/CMakeLists.txt @@ -14,7 +14,6 @@ add_entrypoint_object( dlerror.h DEPENDS libc.include.dlfcn - libc.src.errno.errno ) add_entrypoint_object( @@ -25,7 +24,6 @@ add_entrypoint_object( dlopen.h DEPENDS libc.include.dlfcn - libc.src.errno.errno ) add_entrypoint_object( @@ -36,7 +34,6 @@ add_entrypoint_object( dlsym.h DEPENDS libc.include.dlfcn - libc.src.errno.errno ) add_entrypoint_object( @@ -47,7 +44,6 @@ add_entrypoint_object( dlinfo.h DEPENDS libc.include.dlfcn - libc.src.errno.errno ) add_entrypoint_object( @@ -58,5 +54,4 @@ add_entrypoint_object( dladdr.h DEPENDS libc.include.dlfcn - libc.src.errno.errno ) diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 8802c8c..838ca4d 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -1396,9 +1396,6 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { // change in section sizes can have cascading effect and require another // relaxation pass. bool LoongArch::relaxOnce(int pass) const { - if (ctx.arg.relocatable) - return false; - if (pass == 0) initSymbolAnchors(ctx); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 72d8315..ba0584b 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -942,9 +942,6 @@ static bool relax(Ctx &ctx, int pass, InputSection &sec) { // relaxation pass. 
bool RISCV::relaxOnce(int pass) const {
  llvm::TimeTraceScope timeScope("RISC-V relaxOnce");
-  if (ctx.arg.relocatable)
-    return false;
-
  if (pass == 0)
    initSymbolAnchors(ctx);
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 2b0e097..4fa8039 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1541,8 +1541,10 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
   if (ctx.arg.randomizeSectionPadding)
     randomizeSectionPadding(ctx);

+  // Iterate until a fixed point is reached, skipping relocatable links since
+  // the final addresses are unavailable.
   uint32_t pass = 0, assignPasses = 0;
-  for (;;) {
+  while (!ctx.arg.relocatable) {
     bool changed = ctx.target->needsThunks
                        ? tc.createThunks(pass, ctx.outputSections)
                        : ctx.target->relaxOnce(pass);
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 9eb391c..f11c65a 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1635,27 +1635,21 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   config->osoPrefix = args.getLastArgValue(OPT_oso_prefix);
   if (!config->osoPrefix.empty()) {
-    // Expand special characters, such as ".", "..", or "~", if present.
-    // Note: LD64 only expands "." and not other special characters.
-    // That seems silly to imitate so we will not try to follow it, but rather
-    // just use real_path() to do it.
-
     // The max path length is 4096, in theory. However that seems quite long
     // and seems unlikely that any one would want to strip everything from the
     // path. Hence we've picked a reasonably large number here.
     SmallString<1024> expanded;
-    if (!fs::real_path(config->osoPrefix, expanded,
-                       /*expand_tilde=*/true)) {
-      // Note: LD64 expands "." to be `<current_dir>/`
-      // (ie., it has a slash suffix) whereas real_path() doesn't.
-      // So we have to append '/' to be consistent.
-      StringRef sep = sys::path::get_separator();
-      // real_path removes trailing slashes as part of the normalization, but
-      // these are meaningful for our text based stripping
-      if (config->osoPrefix == "." || config->osoPrefix.ends_with(sep))
-        expanded += sep;
-      config->osoPrefix = saver().save(expanded.str());
+    // Expand "." into the current working directory.
+    if (config->osoPrefix == "." && !fs::current_path(expanded)) {
+      // Note: LD64 expands "." to be `<current_dir>/`
+      // (i.e., it has a slash suffix) whereas current_path() doesn't.
+      // So we have to append '/' to be consistent because this is
+      // meaningful for our text based stripping.
+      expanded += sys::path::get_separator();
+    } else {
+      expanded = config->osoPrefix;
     }
+    config->osoPrefix = saver().save(expanded.str());
   }

   bool pie = args.hasFlag(OPT_pie, OPT_no_pie, true);
diff --git a/lld/test/MachO/stabs.s b/lld/test/MachO/stabs.s
index 968656d..e32b9fc 100644
--- a/lld/test/MachO/stabs.s
+++ b/lld/test/MachO/stabs.s
@@ -63,9 +63,18 @@
 # RUN: dsymutil -s %t/test-rel | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-PATH-NO-SLASH
 # RUN: cd %t && ZERO_AR_DATE=0 %lld -lSystem test.o foo.o no-debug.o -oso_prefix "."
-o %t/test-rel-dot # RUN: dsymutil -s %t/test-rel-dot | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-DOT -## Set HOME to %t (for ~ to expand to) -# RUN: cd %t && env HOME=%t ZERO_AR_DATE=0 %lld -lSystem test.o foo.o no-debug.o -oso_prefix "~" -o %t/test-rel-tilde -# RUN: dsymutil -s %t/test-rel-tilde | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-PATH +# RUN: cd %t && ZERO_AR_DATE=0 %lld -lSystem ./test.o ./foo.o ./no-debug.o -oso_prefix "." -o %t/test-rel-dot +# RUN: dsymutil -s %t/test-rel-dot | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-DOT-EXPLICIT + +## Check that symlinks are not expanded when -oso_prefix . is used. +# RUN: mkdir -p %t/private/var/folders/tmp && ln -s private/var %t/var +# RUN: cp %t/test.o %t/foo.o %t/no-debug.o %t/private/var/folders/tmp +# RUN: env TZ=GMT touch -t "197001010000.16" %t/private/var/folders/tmp/test.o +# RUN: env TZ=GMT touch -t "197001010000.32" %t/private/var/folders/tmp/foo.o +# RUN: cd %t/var/folders/tmp && ZERO_AR_DATE=0 %lld -lSystem test.o foo.o no-debug.o -oso_prefix "." -o test-rel-symlink +# RUN: dsymutil -s %t/private/var/folders/tmp/test-rel-symlink | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-DOT +# RUN: cd %t/var/folders/tmp && ZERO_AR_DATE=0 %lld -lSystem ./test.o ./foo.o ./no-debug.o -oso_prefix "." -o test-rel-symlink +# RUN: dsymutil -s %t/private/var/folders/tmp/test-rel-symlink | grep 'N_OSO' | FileCheck %s -D#TEST_TIME=0x10 -D#FOO_TIME=0x20 --check-prefix=REL-DOT-EXPLICIT ## Check that we don't emit DWARF or stabs when -S is used # RUN: %lld -lSystem test.o foo.o no-debug.o -S -o %t/test-no-debug @@ -91,6 +100,7 @@ # REL-PATH: (N_OSO ) 03 0001 [[#%.16x,TEST_TIME]] '/test.o' # REL-PATH-NO-SLASH: (N_OSO ) 03 0001 [[#%.16x,TEST_TIME]] 'test.o' # REL-DOT: (N_OSO ) 03 0001 [[#%.16x,TEST_TIME]] 'test.o' +# REL-DOT-EXPLICIT: (N_OSO ) 03 0001 [[#%.16x,TEST_TIME]] './test.o' # CHECK-NEXT: (N_STSYM ) [[#%.2d,MORE_DATA_ID + 1]] 0000 [[#%.16x,STATIC:]] '_static_var' # CHECK-NEXT: (N_FUN ) [[#%.2d,TEXT_ID + 1]] 0000 [[#%.16x,MAIN:]] '_main' # CHECK-NEXT: (N_FUN ) 00 0000 0000000000000006{{$}} diff --git a/lldb/include/lldb/Protocol/MCP/Protocol.h b/lldb/include/lldb/Protocol/MCP/Protocol.h index c43b068..6448416 100644 --- a/lldb/include/lldb/Protocol/MCP/Protocol.h +++ b/lldb/include/lldb/Protocol/MCP/Protocol.h @@ -21,7 +21,7 @@ namespace lldb_protocol::mcp { -static llvm::StringLiteral kVersion = "2024-11-05"; +static llvm::StringLiteral kProtocolVersion = "2024-11-05"; /// A request that expects a response. struct Request { diff --git a/lldb/include/lldb/Protocol/MCP/Server.h b/lldb/include/lldb/Protocol/MCP/Server.h new file mode 100644 index 0000000..2ac0588 --- /dev/null +++ b/lldb/include/lldb/Protocol/MCP/Server.h @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_PROTOCOL_MCP_SERVER_H +#define LLDB_PROTOCOL_MCP_SERVER_H + +#include "lldb/Protocol/MCP/Protocol.h" +#include "lldb/Protocol/MCP/Resource.h" +#include "lldb/Protocol/MCP/Tool.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/Error.h" +#include <mutex> + +namespace lldb_protocol::mcp { + +class Server { +public: + Server(std::string name, std::string version); + virtual ~Server() = default; + + void AddTool(std::unique_ptr<Tool> tool); + void AddResourceProvider(std::unique_ptr<ResourceProvider> resource_provider); + +protected: + virtual Capabilities GetCapabilities() = 0; + + using RequestHandler = + std::function<llvm::Expected<Response>(const Request &)>; + using NotificationHandler = std::function<void(const Notification &)>; + + void AddRequestHandlers(); + + void AddRequestHandler(llvm::StringRef method, RequestHandler handler); + void AddNotificationHandler(llvm::StringRef method, + NotificationHandler handler); + + llvm::Expected<std::optional<Message>> HandleData(llvm::StringRef data); + + llvm::Expected<Response> Handle(Request request); + void Handle(Notification notification); + + llvm::Expected<Response> InitializeHandler(const Request &); + + llvm::Expected<Response> ToolsListHandler(const Request &); + llvm::Expected<Response> ToolsCallHandler(const Request &); + + llvm::Expected<Response> ResourcesListHandler(const Request &); + llvm::Expected<Response> ResourcesReadHandler(const Request &); + + std::mutex m_mutex; + +private: + const std::string m_name; + const std::string m_version; + + llvm::StringMap<std::unique_ptr<Tool>> m_tools; + std::vector<std::unique_ptr<ResourceProvider>> m_resource_providers; + + llvm::StringMap<RequestHandler> m_request_handlers; + llvm::StringMap<NotificationHandler> m_notification_handlers; +}; + +} // namespace lldb_protocol::mcp + +#endif diff --git a/lldb/source/Core/DemangledNameInfo.cpp b/lldb/source/Core/DemangledNameInfo.cpp index 00227f0..76f8987 100644 --- a/lldb/source/Core/DemangledNameInfo.cpp +++ b/lldb/source/Core/DemangledNameInfo.cpp @@ -111,6 +111,11 @@ void TrackingOutputBuffer::finalizeEnd() { if (NameInfo.ScopeRange.first > NameInfo.ScopeRange.second) NameInfo.ScopeRange.second = NameInfo.ScopeRange.first; NameInfo.BasenameRange.first = NameInfo.ScopeRange.second; + + // We call anything past the FunctionEncoding the "suffix". + // In practice this would be nodes like `DotSuffix` that wrap + // a FunctionEncoding. + NameInfo.SuffixRange.first = getCurrentPosition(); } ScopedOverride<unsigned> TrackingOutputBuffer::enterFunctionTypePrinting() { @@ -138,6 +143,9 @@ void TrackingOutputBuffer::printLeft(const Node &N) { default: OutputBuffer::printLeft(N); } + + // Keep updating suffix until we reach the end. + NameInfo.SuffixRange.second = getCurrentPosition(); } void TrackingOutputBuffer::printRight(const Node &N) { @@ -151,6 +159,9 @@ void TrackingOutputBuffer::printRight(const Node &N) { default: OutputBuffer::printRight(N); } + + // Keep updating suffix until we reach the end. 
+ NameInfo.SuffixRange.second = getCurrentPosition(); } void TrackingOutputBuffer::printLeftImpl(const FunctionType &N) { diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp index 3663f43..ce4db4e 100644 --- a/lldb/source/Core/Mangled.cpp +++ b/lldb/source/Core/Mangled.cpp @@ -172,9 +172,6 @@ GetItaniumDemangledStr(const char *M) { TrackingOutputBuffer OB(demangled_cstr, demangled_size); demangled_cstr = ipd.finishDemangle(&OB); - // TODO: we should set the SuffixRange inside the TrackingOutputBuffer. - OB.NameInfo.SuffixRange.first = OB.NameInfo.QualifiersRange.second; - OB.NameInfo.SuffixRange.second = std::string_view(OB).size(); info = std::move(OB.NameInfo); assert(demangled_cstr && diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 3bc8708..3118ff1 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -392,13 +392,16 @@ GetDemangledScope(const SymbolContext &sc) { return CPlusPlusLanguage::GetDemangledScope(demangled_name, info); } -/// Handles anything printed after the FunctionEncoding ItaniumDemangle -/// node. Most notably the DotSuffix node. -/// -/// FIXME: the suffix should also have an associated -/// CPlusPlusLanguage::GetDemangledFunctionSuffix -/// once we start setting the `DemangledNameInfo::SuffixRange` -/// from inside the `TrackingOutputBuffer`. +llvm::Expected<llvm::StringRef> +CPlusPlusLanguage::GetDemangledFunctionSuffix(llvm::StringRef demangled, + const DemangledNameInfo &info) { + if (!info.hasSuffix()) + return llvm::createStringError("Suffix range for '%s' is invalid.", + demangled.data()); + + return demangled.slice(info.SuffixRange.first, info.SuffixRange.second); +} + static llvm::Expected<llvm::StringRef> GetDemangledFunctionSuffix(const SymbolContext &sc) { auto info_or_err = GetAndValidateInfo(sc); @@ -407,11 +410,7 @@ GetDemangledFunctionSuffix(const SymbolContext &sc) { auto [demangled_name, info] = *info_or_err; - if (!info.hasSuffix()) - return llvm::createStringError("Suffix range for '%s' is invalid.", - demangled_name.data()); - - return demangled_name.slice(info.SuffixRange.first, info.SuffixRange.second); + return CPlusPlusLanguage::GetDemangledFunctionSuffix(demangled_name, info); } llvm::Expected<llvm::StringRef> @@ -2424,7 +2423,7 @@ bool CPlusPlusLanguage::HandleFrameFormatVariable( return true; } case FormatEntity::Entry::Type::FunctionSuffix: { - auto suffix_or_err = GetDemangledFunctionSuffix(sc); + auto suffix_or_err = ::GetDemangledFunctionSuffix(sc); if (!suffix_or_err) { LLDB_LOG_ERROR( GetLog(LLDBLog::Language), suffix_or_err.takeError(), diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h index 4f449f1..4a30299 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -138,6 +138,10 @@ public: GetDemangledFunctionArguments(llvm::StringRef demangled, const DemangledNameInfo &info); + static llvm::Expected<llvm::StringRef> + GetDemangledFunctionSuffix(llvm::StringRef demangled, + const DemangledNameInfo &info); + // Extract C++ context and identifier from a string using heuristic matching // (as opposed to // CPlusPlusLanguage::CxxMethodName which has to have a fully qualified C++ diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h 
b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h index 021ec49..bca5bff 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h @@ -46,9 +46,9 @@ public: // based on RegisterContextDarwin_arm64.h // Pack this so there are no extra bytes, but align its start address to at - // least 4 bytes to prevent alignment errors on Arm 32-bit. + // least 8 bytes to prevent alignment errors. LLVM_PACKED_START - struct alignas(4) GPR { + struct alignas(8) GPR { uint64_t x[29]; // x0-x28 uint64_t fp; // x29 uint64_t lr; // x30 diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp index c9fe474..c359663 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp @@ -27,25 +27,12 @@ using namespace llvm; LLDB_PLUGIN_DEFINE(ProtocolServerMCP) static constexpr size_t kChunkSize = 1024; +static constexpr llvm::StringLiteral kName = "lldb-mcp"; +static constexpr llvm::StringLiteral kVersion = "0.1.0"; -ProtocolServerMCP::ProtocolServerMCP() : ProtocolServer() { - AddRequestHandler("initialize", - std::bind(&ProtocolServerMCP::InitializeHandler, this, - std::placeholders::_1)); - - AddRequestHandler("tools/list", - std::bind(&ProtocolServerMCP::ToolsListHandler, this, - std::placeholders::_1)); - AddRequestHandler("tools/call", - std::bind(&ProtocolServerMCP::ToolsCallHandler, this, - std::placeholders::_1)); - - AddRequestHandler("resources/list", - std::bind(&ProtocolServerMCP::ResourcesListHandler, this, - std::placeholders::_1)); - AddRequestHandler("resources/read", - std::bind(&ProtocolServerMCP::ResourcesReadHandler, this, - std::placeholders::_1)); +ProtocolServerMCP::ProtocolServerMCP() + : ProtocolServer(), + lldb_protocol::mcp::Server(std::string(kName), std::string(kVersion)) { AddNotificationHandler("notifications/initialized", [](const lldb_protocol::mcp::Notification &) { LLDB_LOG(GetLog(LLDBLog::Host), @@ -77,32 +64,6 @@ llvm::StringRef ProtocolServerMCP::GetPluginDescriptionStatic() { return "MCP Server."; } -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::Handle(lldb_protocol::mcp::Request request) { - auto it = m_request_handlers.find(request.method); - if (it != m_request_handlers.end()) { - llvm::Expected<lldb_protocol::mcp::Response> response = it->second(request); - if (!response) - return response; - response->id = request.id; - return *response; - } - - return make_error<MCPError>( - llvm::formatv("no handler for request: {0}", request.method).str()); -} - -void ProtocolServerMCP::Handle(lldb_protocol::mcp::Notification notification) { - auto it = m_notification_handlers.find(notification.method); - if (it != m_notification_handlers.end()) { - it->second(notification); - return; - } - - LLDB_LOG(GetLog(LLDBLog::Host), "MPC notification: {0} ({1})", - notification.method, notification.params); -} - void ProtocolServerMCP::AcceptCallback(std::unique_ptr<Socket> socket) { LLDB_LOG(GetLog(LLDBLog::Host), "New MCP client ({0}) connected", m_clients.size() + 1); @@ -157,7 +118,7 @@ llvm::Error ProtocolServerMCP::ReadCallback(Client &client) { } llvm::Error ProtocolServerMCP::Start(ProtocolServer::Connection connection) { - std::lock_guard<std::mutex> guard(m_server_mutex); + std::lock_guard<std::mutex> guard(m_mutex); if (m_running) return llvm::createStringError("the MCP server is already running"); @@ -189,7 +150,7 @@ llvm::Error 
ProtocolServerMCP::Start(ProtocolServer::Connection connection) { llvm::Error ProtocolServerMCP::Stop() { { - std::lock_guard<std::mutex> guard(m_server_mutex); + std::lock_guard<std::mutex> guard(m_mutex); if (!m_running) return createStringError("the MCP sever is not running"); m_running = false; @@ -204,7 +165,7 @@ llvm::Error ProtocolServerMCP::Stop() { m_loop_thread.join(); { - std::lock_guard<std::mutex> guard(m_server_mutex); + std::lock_guard<std::mutex> guard(m_mutex); m_listener.reset(); m_listen_handlers.clear(); m_clients.clear(); @@ -213,48 +174,6 @@ llvm::Error ProtocolServerMCP::Stop() { return llvm::Error::success(); } -llvm::Expected<std::optional<lldb_protocol::mcp::Message>> -ProtocolServerMCP::HandleData(llvm::StringRef data) { - auto message = llvm::json::parse<lldb_protocol::mcp::Message>(/*JSON=*/data); - if (!message) - return message.takeError(); - - if (const lldb_protocol::mcp::Request *request = - std::get_if<lldb_protocol::mcp::Request>(&(*message))) { - llvm::Expected<lldb_protocol::mcp::Response> response = Handle(*request); - - // Handle failures by converting them into an Error message. - if (!response) { - lldb_protocol::mcp::Error protocol_error; - llvm::handleAllErrors( - response.takeError(), - [&](const MCPError &err) { protocol_error = err.toProtcolError(); }, - [&](const llvm::ErrorInfoBase &err) { - protocol_error.error.code = MCPError::kInternalError; - protocol_error.error.message = err.message(); - }); - protocol_error.id = request->id; - return protocol_error; - } - - return *response; - } - - if (const lldb_protocol::mcp::Notification *notification = - std::get_if<lldb_protocol::mcp::Notification>(&(*message))) { - Handle(*notification); - return std::nullopt; - } - - if (std::get_if<lldb_protocol::mcp::Error>(&(*message))) - return llvm::createStringError("unexpected MCP message: error"); - - if (std::get_if<lldb_protocol::mcp::Response>(&(*message))) - return llvm::createStringError("unexpected MCP message: response"); - - llvm_unreachable("all message types handled"); -} - lldb_protocol::mcp::Capabilities ProtocolServerMCP::GetCapabilities() { lldb_protocol::mcp::Capabilities capabilities; capabilities.tools.listChanged = true; @@ -263,158 +182,3 @@ lldb_protocol::mcp::Capabilities ProtocolServerMCP::GetCapabilities() { capabilities.resources.listChanged = false; return capabilities; } - -void ProtocolServerMCP::AddTool(std::unique_ptr<Tool> tool) { - std::lock_guard<std::mutex> guard(m_server_mutex); - - if (!tool) - return; - m_tools[tool->GetName()] = std::move(tool); -} - -void ProtocolServerMCP::AddResourceProvider( - std::unique_ptr<ResourceProvider> resource_provider) { - std::lock_guard<std::mutex> guard(m_server_mutex); - - if (!resource_provider) - return; - m_resource_providers.push_back(std::move(resource_provider)); -} - -void ProtocolServerMCP::AddRequestHandler(llvm::StringRef method, - RequestHandler handler) { - std::lock_guard<std::mutex> guard(m_server_mutex); - m_request_handlers[method] = std::move(handler); -} - -void ProtocolServerMCP::AddNotificationHandler(llvm::StringRef method, - NotificationHandler handler) { - std::lock_guard<std::mutex> guard(m_server_mutex); - m_notification_handlers[method] = std::move(handler); -} - -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::InitializeHandler( - const lldb_protocol::mcp::Request &request) { - lldb_protocol::mcp::Response response; - response.result.emplace(llvm::json::Object{ - {"protocolVersion", lldb_protocol::mcp::kVersion}, - {"capabilities", 
GetCapabilities()}, - {"serverInfo", - llvm::json::Object{{"name", kName}, {"version", kVersion}}}}); - return response; -} - -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::ToolsListHandler( - const lldb_protocol::mcp::Request &request) { - lldb_protocol::mcp::Response response; - - llvm::json::Array tools; - for (const auto &tool : m_tools) - tools.emplace_back(toJSON(tool.second->GetDefinition())); - - response.result.emplace(llvm::json::Object{{"tools", std::move(tools)}}); - - return response; -} - -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::ToolsCallHandler( - const lldb_protocol::mcp::Request &request) { - lldb_protocol::mcp::Response response; - - if (!request.params) - return llvm::createStringError("no tool parameters"); - - const json::Object *param_obj = request.params->getAsObject(); - if (!param_obj) - return llvm::createStringError("no tool parameters"); - - const json::Value *name = param_obj->get("name"); - if (!name) - return llvm::createStringError("no tool name"); - - llvm::StringRef tool_name = name->getAsString().value_or(""); - if (tool_name.empty()) - return llvm::createStringError("no tool name"); - - auto it = m_tools.find(tool_name); - if (it == m_tools.end()) - return llvm::createStringError(llvm::formatv("no tool \"{0}\"", tool_name)); - - lldb_protocol::mcp::ToolArguments tool_args; - if (const json::Value *args = param_obj->get("arguments")) - tool_args = *args; - - llvm::Expected<lldb_protocol::mcp::TextResult> text_result = - it->second->Call(tool_args); - if (!text_result) - return text_result.takeError(); - - response.result.emplace(toJSON(*text_result)); - - return response; -} - -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::ResourcesListHandler( - const lldb_protocol::mcp::Request &request) { - lldb_protocol::mcp::Response response; - - llvm::json::Array resources; - - std::lock_guard<std::mutex> guard(m_server_mutex); - for (std::unique_ptr<ResourceProvider> &resource_provider_up : - m_resource_providers) { - for (const lldb_protocol::mcp::Resource &resource : - resource_provider_up->GetResources()) - resources.push_back(resource); - } - response.result.emplace( - llvm::json::Object{{"resources", std::move(resources)}}); - - return response; -} - -llvm::Expected<lldb_protocol::mcp::Response> -ProtocolServerMCP::ResourcesReadHandler( - const lldb_protocol::mcp::Request &request) { - lldb_protocol::mcp::Response response; - - if (!request.params) - return llvm::createStringError("no resource parameters"); - - const json::Object *param_obj = request.params->getAsObject(); - if (!param_obj) - return llvm::createStringError("no resource parameters"); - - const json::Value *uri = param_obj->get("uri"); - if (!uri) - return llvm::createStringError("no resource uri"); - - llvm::StringRef uri_str = uri->getAsString().value_or(""); - if (uri_str.empty()) - return llvm::createStringError("no resource uri"); - - std::lock_guard<std::mutex> guard(m_server_mutex); - for (std::unique_ptr<ResourceProvider> &resource_provider_up : - m_resource_providers) { - llvm::Expected<lldb_protocol::mcp::ResourceResult> result = - resource_provider_up->ReadResource(uri_str); - if (result.errorIsA<UnsupportedURI>()) { - llvm::consumeError(result.takeError()); - continue; - } - if (!result) - return result.takeError(); - - lldb_protocol::mcp::Response response; - response.result.emplace(std::move(*result)); - return response; - } - - return make_error<MCPError>( - llvm::formatv("no resource handler for uri: {0}", 
uri_str).str(), - MCPError::kResourceNotFound); -} diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h index 2ea9585..7fe909a 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h @@ -13,14 +13,13 @@ #include "lldb/Host/MainLoop.h" #include "lldb/Host/Socket.h" #include "lldb/Protocol/MCP/Protocol.h" -#include "lldb/Protocol/MCP/Resource.h" -#include "lldb/Protocol/MCP/Tool.h" -#include "llvm/ADT/StringMap.h" +#include "lldb/Protocol/MCP/Server.h" #include <thread> namespace lldb_private::mcp { -class ProtocolServerMCP : public ProtocolServer { +class ProtocolServerMCP : public ProtocolServer, + public lldb_protocol::mcp::Server { public: ProtocolServerMCP(); virtual ~ProtocolServerMCP() override; @@ -40,48 +39,10 @@ public: Socket *GetSocket() const override { return m_listener.get(); } -protected: - using RequestHandler = - std::function<llvm::Expected<lldb_protocol::mcp::Response>( - const lldb_protocol::mcp::Request &)>; - using NotificationHandler = - std::function<void(const lldb_protocol::mcp::Notification &)>; - - void AddTool(std::unique_ptr<lldb_protocol::mcp::Tool> tool); - void AddResourceProvider( - std::unique_ptr<lldb_protocol::mcp::ResourceProvider> resource_provider); - - void AddRequestHandler(llvm::StringRef method, RequestHandler handler); - void AddNotificationHandler(llvm::StringRef method, - NotificationHandler handler); - private: void AcceptCallback(std::unique_ptr<Socket> socket); - llvm::Expected<std::optional<lldb_protocol::mcp::Message>> - HandleData(llvm::StringRef data); - - llvm::Expected<lldb_protocol::mcp::Response> - Handle(lldb_protocol::mcp::Request request); - void Handle(lldb_protocol::mcp::Notification notification); - - llvm::Expected<lldb_protocol::mcp::Response> - InitializeHandler(const lldb_protocol::mcp::Request &); - - llvm::Expected<lldb_protocol::mcp::Response> - ToolsListHandler(const lldb_protocol::mcp::Request &); - llvm::Expected<lldb_protocol::mcp::Response> - ToolsCallHandler(const lldb_protocol::mcp::Request &); - - llvm::Expected<lldb_protocol::mcp::Response> - ResourcesListHandler(const lldb_protocol::mcp::Request &); - llvm::Expected<lldb_protocol::mcp::Response> - ResourcesReadHandler(const lldb_protocol::mcp::Request &); - - lldb_protocol::mcp::Capabilities GetCapabilities(); - - llvm::StringLiteral kName = "lldb-mcp"; - llvm::StringLiteral kVersion = "0.1.0"; + lldb_protocol::mcp::Capabilities GetCapabilities() override; bool m_running = false; @@ -98,14 +59,6 @@ private: }; llvm::Error ReadCallback(Client &client); std::vector<std::unique_ptr<Client>> m_clients; - - std::mutex m_server_mutex; - llvm::StringMap<std::unique_ptr<lldb_protocol::mcp::Tool>> m_tools; - std::vector<std::unique_ptr<lldb_protocol::mcp::ResourceProvider>> - m_resource_providers; - - llvm::StringMap<RequestHandler> m_request_handlers; - llvm::StringMap<NotificationHandler> m_notification_handlers; }; } // namespace lldb_private::mcp diff --git a/lldb/source/Protocol/MCP/CMakeLists.txt b/lldb/source/Protocol/MCP/CMakeLists.txt index f1b1098..a73e7e6 100644 --- a/lldb/source/Protocol/MCP/CMakeLists.txt +++ b/lldb/source/Protocol/MCP/CMakeLists.txt @@ -1,6 +1,7 @@ add_lldb_library(lldbProtocolMCP NO_PLUGIN_DEPENDENCIES MCPError.cpp Protocol.cpp + Server.cpp Tool.cpp LINK_COMPONENTS diff --git a/lldb/source/Protocol/MCP/Server.cpp b/lldb/source/Protocol/MCP/Server.cpp new file mode 100644 index 0000000..4ec127fe --- 
/dev/null +++ b/lldb/source/Protocol/MCP/Server.cpp @@ -0,0 +1,236 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Protocol/MCP/Server.h" +#include "lldb/Protocol/MCP/MCPError.h" + +using namespace lldb_protocol::mcp; +using namespace llvm; + +Server::Server(std::string name, std::string version) + : m_name(std::move(name)), m_version(std::move(version)) { + AddRequestHandlers(); +} + +void Server::AddRequestHandlers() { + AddRequestHandler("initialize", std::bind(&Server::InitializeHandler, this, + std::placeholders::_1)); + AddRequestHandler("tools/list", std::bind(&Server::ToolsListHandler, this, + std::placeholders::_1)); + AddRequestHandler("tools/call", std::bind(&Server::ToolsCallHandler, this, + std::placeholders::_1)); + AddRequestHandler("resources/list", std::bind(&Server::ResourcesListHandler, + this, std::placeholders::_1)); + AddRequestHandler("resources/read", std::bind(&Server::ResourcesReadHandler, + this, std::placeholders::_1)); +} + +llvm::Expected<Response> Server::Handle(Request request) { + auto it = m_request_handlers.find(request.method); + if (it != m_request_handlers.end()) { + llvm::Expected<Response> response = it->second(request); + if (!response) + return response; + response->id = request.id; + return *response; + } + + return llvm::make_error<MCPError>( + llvm::formatv("no handler for request: {0}", request.method).str()); +} + +void Server::Handle(Notification notification) { + auto it = m_notification_handlers.find(notification.method); + if (it != m_notification_handlers.end()) { + it->second(notification); + return; + } +} + +llvm::Expected<std::optional<Message>> +Server::HandleData(llvm::StringRef data) { + auto message = llvm::json::parse<Message>(/*JSON=*/data); + if (!message) + return message.takeError(); + + if (const Request *request = std::get_if<Request>(&(*message))) { + llvm::Expected<Response> response = Handle(*request); + + // Handle failures by converting them into an Error message. 
+ if (!response) { + Error protocol_error; + llvm::handleAllErrors( + response.takeError(), + [&](const MCPError &err) { protocol_error = err.toProtcolError(); }, + [&](const llvm::ErrorInfoBase &err) { + protocol_error.error.code = MCPError::kInternalError; + protocol_error.error.message = err.message(); + }); + protocol_error.id = request->id; + return protocol_error; + } + + return *response; + } + + if (const Notification *notification = + std::get_if<Notification>(&(*message))) { + Handle(*notification); + return std::nullopt; + } + + if (std::get_if<Error>(&(*message))) + return llvm::createStringError("unexpected MCP message: error"); + + if (std::get_if<Response>(&(*message))) + return llvm::createStringError("unexpected MCP message: response"); + + llvm_unreachable("all message types handled"); +} + +void Server::AddTool(std::unique_ptr<Tool> tool) { + std::lock_guard<std::mutex> guard(m_mutex); + + if (!tool) + return; + m_tools[tool->GetName()] = std::move(tool); +} + +void Server::AddResourceProvider( + std::unique_ptr<ResourceProvider> resource_provider) { + std::lock_guard<std::mutex> guard(m_mutex); + + if (!resource_provider) + return; + m_resource_providers.push_back(std::move(resource_provider)); +} + +void Server::AddRequestHandler(llvm::StringRef method, RequestHandler handler) { + std::lock_guard<std::mutex> guard(m_mutex); + m_request_handlers[method] = std::move(handler); +} + +void Server::AddNotificationHandler(llvm::StringRef method, + NotificationHandler handler) { + std::lock_guard<std::mutex> guard(m_mutex); + m_notification_handlers[method] = std::move(handler); +} + +llvm::Expected<Response> Server::InitializeHandler(const Request &request) { + Response response; + response.result.emplace(llvm::json::Object{ + {"protocolVersion", mcp::kProtocolVersion}, + {"capabilities", GetCapabilities()}, + {"serverInfo", + llvm::json::Object{{"name", m_name}, {"version", m_version}}}}); + return response; +} + +llvm::Expected<Response> Server::ToolsListHandler(const Request &request) { + Response response; + + llvm::json::Array tools; + for (const auto &tool : m_tools) + tools.emplace_back(toJSON(tool.second->GetDefinition())); + + response.result.emplace(llvm::json::Object{{"tools", std::move(tools)}}); + + return response; +} + +llvm::Expected<Response> Server::ToolsCallHandler(const Request &request) { + Response response; + + if (!request.params) + return llvm::createStringError("no tool parameters"); + + const json::Object *param_obj = request.params->getAsObject(); + if (!param_obj) + return llvm::createStringError("no tool parameters"); + + const json::Value *name = param_obj->get("name"); + if (!name) + return llvm::createStringError("no tool name"); + + llvm::StringRef tool_name = name->getAsString().value_or(""); + if (tool_name.empty()) + return llvm::createStringError("no tool name"); + + auto it = m_tools.find(tool_name); + if (it == m_tools.end()) + return llvm::createStringError(llvm::formatv("no tool \"{0}\"", tool_name)); + + ToolArguments tool_args; + if (const json::Value *args = param_obj->get("arguments")) + tool_args = *args; + + llvm::Expected<TextResult> text_result = it->second->Call(tool_args); + if (!text_result) + return text_result.takeError(); + + response.result.emplace(toJSON(*text_result)); + + return response; +} + +llvm::Expected<Response> Server::ResourcesListHandler(const Request &request) { + Response response; + + llvm::json::Array resources; + + std::lock_guard<std::mutex> guard(m_mutex); + for (std::unique_ptr<ResourceProvider> 
&resource_provider_up : + m_resource_providers) { + for (const Resource &resource : resource_provider_up->GetResources()) + resources.push_back(resource); + } + response.result.emplace( + llvm::json::Object{{"resources", std::move(resources)}}); + + return response; +} + +llvm::Expected<Response> Server::ResourcesReadHandler(const Request &request) { + Response response; + + if (!request.params) + return llvm::createStringError("no resource parameters"); + + const json::Object *param_obj = request.params->getAsObject(); + if (!param_obj) + return llvm::createStringError("no resource parameters"); + + const json::Value *uri = param_obj->get("uri"); + if (!uri) + return llvm::createStringError("no resource uri"); + + llvm::StringRef uri_str = uri->getAsString().value_or(""); + if (uri_str.empty()) + return llvm::createStringError("no resource uri"); + + std::lock_guard<std::mutex> guard(m_mutex); + for (std::unique_ptr<ResourceProvider> &resource_provider_up : + m_resource_providers) { + llvm::Expected<ResourceResult> result = + resource_provider_up->ReadResource(uri_str); + if (result.errorIsA<UnsupportedURI>()) { + llvm::consumeError(result.takeError()); + continue; + } + if (!result) + return result.takeError(); + + Response response; + response.result.emplace(std::move(*result)); + return response; + } + + return make_error<MCPError>( + llvm::formatv("no resource handler for uri: {0}", uri_str).str(), + MCPError::kResourceNotFound); +} diff --git a/lldb/test/Shell/SymbolFile/PDB/ast-restore.test b/lldb/test/Shell/SymbolFile/PDB/ast-restore.test index a91364b..f127acd 100644 --- a/lldb/test/Shell/SymbolFile/PDB/ast-restore.test +++ b/lldb/test/Shell/SymbolFile/PDB/ast-restore.test @@ -3,13 +3,19 @@ RUN: %build --compiler=msvc --nodefaultlib --output=%t.exe %S/Inputs/AstRestoreT RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=ENUM %s RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=ENUM %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=GLOBAL %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=GLOBAL %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=BASE %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=BASE %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=CLASS %s RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=CLASS %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=INNER %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=INNER %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=TEMPLATE %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=TEMPLATE %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=FOO %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=FOO %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=MAIN %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=MAIN %s ENUM: Module: {{.*}} ENUM: namespace N0 { diff --git a/lldb/test/Shell/SymbolFile/PDB/calling-conventions-x86.test b/lldb/test/Shell/SymbolFile/PDB/calling-conventions-x86.test index 065c8b6..ceb7ad1 100644 --- 
a/lldb/test/Shell/SymbolFile/PDB/calling-conventions-x86.test +++ b/lldb/test/Shell/SymbolFile/PDB/calling-conventions-x86.test @@ -1,8 +1,12 @@ REQUIRES: system-windows, lld, (target-x86 || target-x86_64) -RUN: %build --compiler=clang-cl --arch=32 --nodefaultlib --output=%t.exe %S/Inputs/CallingConventionsTest.cpp \ -RUN: && lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix 32BIT-CHECK %s -RUN: %build --compiler=clang-cl --arch=64 --nodefaultlib --output=%t.exe %S/Inputs/CallingConventionsTest.cpp \ -RUN: && lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix 64BIT-CHECK %s + +RUN: %build --compiler=clang-cl --arch=32 --nodefaultlib --output=%t-32.exe %S/Inputs/CallingConventionsTest.cpp +RUN: lldb-test symbols -dump-ast %t-32.exe | FileCheck --check-prefix 32BIT-CHECK %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t-32.exe | FileCheck --check-prefix 32BIT-CHECK %s + +RUN: %build --compiler=clang-cl --arch=64 --nodefaultlib --output=%t-64.exe %S/Inputs/CallingConventionsTest.cpp +RUN: lldb-test symbols -dump-ast %t-64.exe | FileCheck --check-prefix 64BIT-CHECK %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t-64.exe | FileCheck --check-prefix 64BIT-CHECK %s 64BIT-CHECK: Module: {{.*}} 64BIT-CHECK-DAG: int (*FuncCCallPtr)(); diff --git a/lldb/test/Shell/SymbolFile/PDB/vbases.test b/lldb/test/Shell/SymbolFile/PDB/vbases.test index b58e3ed..e5ab80b 100644 --- a/lldb/test/Shell/SymbolFile/PDB/vbases.test +++ b/lldb/test/Shell/SymbolFile/PDB/vbases.test @@ -1,6 +1,7 @@ REQUIRES: target-windows, lld RUN: %build --compiler=clang-cl --output=%t.exe %S/Inputs/VBases.cpp RUN: %lldb -b -s %S/Inputs/VBases.script -- %t.exe | FileCheck %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -b -s %S/Inputs/VBases.script -- %t.exe | FileCheck %s CHECK: { CHECK: A = (a = '\x01') diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp index a6e6e75f..cbc0c5d 100644 --- a/lldb/unittests/Core/MangledTest.cpp +++ b/lldb/unittests/Core/MangledTest.cpp @@ -889,10 +889,10 @@ TEST_P(DemanglingInfoCorrectnessTestFixutre, Correctness) { EXPECT_THAT_EXPECTED(qualifiers, llvm::Succeeded()); reconstructed_name += *qualifiers; - // TODO: should retrieve suffix using the plugin too. 
- auto suffix = tracked_name.slice(OB->NameInfo.QualifiersRange.second, - llvm::StringRef::npos); - reconstructed_name += suffix; + auto suffix = + CPlusPlusLanguage::GetDemangledFunctionSuffix(tracked_name, OB->NameInfo); + EXPECT_THAT_EXPECTED(suffix, llvm::Succeeded()); + reconstructed_name += *suffix; EXPECT_EQ(reconstructed_name, demangled); } diff --git a/lldb/unittests/Instruction/ARM64/TestAArch64Emulator.cpp b/lldb/unittests/Instruction/ARM64/TestAArch64Emulator.cpp index 08296da..c6bfe0f 100644 --- a/lldb/unittests/Instruction/ARM64/TestAArch64Emulator.cpp +++ b/lldb/unittests/Instruction/ARM64/TestAArch64Emulator.cpp @@ -62,7 +62,7 @@ struct Arch64EmulatorTester : public EmulateInstructionARM64 { reg_value.SetUInt64(tester->gpr.pc); return true; case gpr_cpsr_arm64: - reg_value.SetUInt64(tester->gpr.cpsr); + reg_value.SetUInt32(tester->gpr.cpsr); return true; default: return false; @@ -97,7 +97,7 @@ struct Arch64EmulatorTester : public EmulateInstructionARM64 { tester->gpr.pc = reg_value.GetAsUInt64(); return true; case gpr_cpsr_arm64: - tester->gpr.cpsr = reg_value.GetAsUInt64(); + tester->gpr.cpsr = reg_value.GetAsUInt32(); return true; default: return false; @@ -112,7 +112,7 @@ struct Arch64EmulatorTester : public EmulateInstructionARM64 { assert(addr - tester->memory_offset + length <= sizeof(tester->memory)); if (addr >= tester->memory_offset && addr - tester->memory_offset + length <= sizeof(tester->memory)) { - memcpy(dst, tester->memory + addr - tester->memory_offset, length); + memcpy(dst, tester->memory + (addr - tester->memory_offset), length); return length; } return 0; diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 4c70b98..e9a6faa 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -187,28 +187,29 @@ if ("lldb" IN_LIST LLVM_ENABLE_PROJECTS) endif () if ("libc" IN_LIST LLVM_ENABLE_PROJECTS) - message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated. Please use " + message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated now, and will " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at " "https://libc.llvm.org/ for building the runtimes.") endif() if ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=compiler-rt is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=compiler-rt or see the instructions at " "https://compiler-rt.llvm.org/ for building the runtimes.") endif() if ("offload" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=offload is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=offload or see the instructions at " "https://openmp.llvm.org/ for building the runtimes.") endif() if ("openmp" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=openmp is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=openmp or see the instructions at " "https://openmp.llvm.org/ for building the runtimes.") endif() @@ -221,7 +222,7 @@ endif () if ("libclc" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=libclc is deprecated now, and will " - "become a fatal error in the LLVM 21 release. 
Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=libclc or see the instructions at " "https://libclc.llvm.org/ for building the runtimes.") endif() diff --git a/llvm/cmake/modules/LLVMProcessSources.cmake b/llvm/cmake/modules/LLVMProcessSources.cmake index cf358a8..0670d60 100644 --- a/llvm/cmake/modules/LLVMProcessSources.cmake +++ b/llvm/cmake/modules/LLVMProcessSources.cmake @@ -58,21 +58,6 @@ function(llvm_process_sources OUT_VAR) set(sources ${ARG_UNPARSED_ARGUMENTS}) llvm_check_source_file_list(${sources}) - # Don't generate __SHORT_FILE__ on VS builds as it can prevent build parallelisation. - if(NOT CMAKE_GENERATOR MATCHES "Visual Studio") - foreach(fn ${sources}) - get_filename_component(suf ${fn} EXT) - if("${suf}" STREQUAL ".cpp" OR "${suf}" STREQUAL ".c") - get_filename_component(short_name ${fn} NAME) - set_property( - SOURCE ${fn} - APPEND - PROPERTY COMPILE_DEFINITIONS __SHORT_FILE__="${short_name}") - endif() - endforeach() - endif() - - # This adds .td and .h files to the Visual Studio solution: add_td_sources(sources) find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}") diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst index 02344bc..c27f603 100644 --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -7,7 +7,7 @@ MergeFunctions pass, how it works Introduction ============ -Sometimes code contains equal functions, or functions that does exactly the same +Sometimes code contains equal functions, or functions that do exactly the same thing even though they are non-equal on the IR level (e.g.: multiplication on 2 and 'shl 1'). It could happen due to several reasons: mainly, the usage of templates and automatic code generators. Though, sometimes the user itself could @@ -16,7 +16,7 @@ write the same thing twice :-) The main purpose of this pass is to recognize such functions and merge them. This document is the extension to pass comments and describes the pass logic. It -describes the algorithm that is used in order to compare functions and +describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. @@ -58,7 +58,7 @@ It's especially important to understand chapter 3 of tutorial: :doc:`tutorial/LangImpl03` -The reader should also know how passes work in LLVM. They could use this +The reader should also know how passes work in LLVM. They can use this article as a reference and start point here: :doc:`WritingAnLLVMPass` @@ -68,7 +68,7 @@ debugging and bug-fixing. Narrative structure ------------------- -The article consists of three parts. The first part explains pass functionality +This article consists of three parts. The first part explains pass functionality on the top-level. The second part describes the comparison procedure itself. The third part describes the merging process. @@ -130,7 +130,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" -How it could this be done? Just convert each function to a number, and gather +How can this be done? Just convert each function to a number, and gather all of them in a special hash-table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it @@ -190,17 +190,17 @@ The algorithm is pretty simple: 1. Put all module's functions into the *worklist*. -2. 
Scan *worklist*'s functions twice: first enumerate only strong functions and +2. Scan *worklist*'s functions twice: first, enumerate only strong functions and then only weak ones: 2.1. Loop body: take a function from *worklist* (call it *FCur*) and try to insert it into *FnTree*: check whether *FCur* is equal to one of functions in *FnTree*. If there *is* an equal function in *FnTree* - (call it *FExists*): merge function *FCur* with *FExists*. Otherwise add + (call it *FExists*): merge function *FCur* with *FExists*. Otherwise, add the function from the *worklist* to *FnTree*. 3. Once the *worklist* scanning and merging operations are complete, check the -*Deferred* list. If it is not empty: refill the *worklist* contents with +*Deferred* list. If it is not empty, refill the *worklist* contents with *Deferred* list and redo step 2, if the *Deferred* list is empty, then exit from method. @@ -249,14 +249,14 @@ Below, we will use the following operations: The rest of the article is based on *MergeFunctions.cpp* source code (found in *<llvm_dir>/lib/Transforms/IPO/MergeFunctions.cpp*). We would like -to ask reader to keep this file open, so we could use it as a reference +to ask the reader to keep this file open, so we could use it as a reference for further explanations. Now, we're ready to proceed to the next chapter and see how it works. Functions comparison ==================== -At first, let's define how exactly we compare complex objects. +First, let's define exactly how we compare complex objects. Complex object comparison (function, basic-block, etc) is mostly based on its sub-object comparison results. It is similar to the next "tree" objects @@ -307,7 +307,7 @@ to those we met later in function body (value we met first would be *less*). This is done by “``FunctionComparator::cmpValues(const Value*, const Value*)``” method (will be described a bit later). -4. Function body comparison. As it written in method comments: +4. Function body comparison. As written in method comments: “We do a CFG-ordered walk since the actual ordering of the blocks in the linked list is immaterial. Our walk starts at the entry block for both functions, then @@ -477,7 +477,7 @@ Of course, we can combine insertion and comparison: = sn_mapR.insert(std::make_pair(Right, sn_mapR.size())); return cmpNumbers(LeftRes.first->second, RightRes.first->second); -Let's look, how whole method could be implemented. +Let's look at how the whole method could be implemented. 1. We have to start with the bad news. Consider function self and cross-referencing cases: @@ -519,7 +519,7 @@ the result of numbers comparison: if (LeftRes.first->second < RightRes.first->second) return -1; return 1; -Now when *cmpValues* returns 0, we can proceed the comparison procedure. +Now, when *cmpValues* returns 0, we can proceed with the comparison procedure. Otherwise, if we get (-1 or 1), we need to pass this result to the top level, and finish comparison procedure. @@ -549,7 +549,7 @@ losslessly bitcasted to each other. The further explanation is modification of 2.1.3.1. If types are vectors, compare their bitwidth using the *cmpNumbers*. If result is not 0, return it. - 2.1.3.2. Different types, but not a vectors: + 2.1.3.2. Different types, but not vectors: * if both of them are pointers, good for us, we can proceed to step 3. * if one of types is pointer, return result of *isPointer* flags @@ -654,7 +654,7 @@ O(N*N) to O(log(N)). 
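Before the merging chapter below, it may help to see the serial-numbering trick
from the comparison chapter in one self-contained piece. This is a minimal
sketch, not code from *MergeFunctions.cpp*: the standalone maps and the
``cmpValuesSketch`` name are illustrative stand-ins for the pass's real state::

    // Each value receives the next serial number the first time it is seen.
    // Two functions can only compare equal if their values occur in the same
    // order, so comparing serial numbers stands in for comparing the values.
    #include <map>
    #include <utility>

    struct Value {}; // stand-in for llvm::Value

    static std::map<const Value *, int> sn_mapL, sn_mapR;

    static int cmpNumbers(int L, int R) {
      if (L < R)
        return -1;
      return L == R ? 0 : 1;
    }

    static int cmpValuesSketch(const Value *Left, const Value *Right) {
      // insert() returns the existing entry when a value was seen before, so
      // each value is assigned its serial number exactly once.
      auto LeftRes = sn_mapL.insert(std::make_pair(Left, int(sn_mapL.size())));
      auto RightRes = sn_mapR.insert(std::make_pair(Right, int(sn_mapR.size())));
      return cmpNumbers(LeftRes.first->second, RightRes.first->second);
    }

As with the document's own fragment above, self- and cross-referencing cases
work out because both maps assign their numbers in encounter order.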
Merging process, mergeTwoFunctions
==================================

-Once *MergeFunctions* detected that current function (*G*) is equal to one that
+Once *MergeFunctions* detects that the current function (*G*) is equal to one that
were analyzed before (function *F*) it calls
``mergeTwoFunctions(Function*, Function*)``.

@@ -664,7 +664,7 @@

Operation affects ``FnTree`` contents with next way: *F* will stay in
functions that calls *G* would be put into ``Deferred`` set and removed from
``FnTree``, and analyzed again.

-The approach is next:
+The approach is as follows:

1. Most wished case: when we can use alias and both of *F* and *G* are weak. We
make both of them with aliases to the third strong function *H*. Actually *H*

@@ -691,12 +691,12 @@ ok: we can use alias to *F* instead of *G* or change call instructions itself.

HasGlobalAliases, removeUsers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-First consider the case when we have global aliases of one function name to
+First, consider the case when we have global aliases of one function name to
another. Our purpose is make both of them with aliases to the third strong
function. Though if we keep *F* alive and without major changes we can leave
it in ``FnTree``. Try to combine these two goals.

-Do stub replacement of *F* itself with an alias to *F*.
+Do a stub replacement of *F* itself with an alias to *F*.

1. Create stub function *H*, with the same name and attributes like function
*F*. It takes maximum alignment of *F* and *G*.

@@ -725,7 +725,7 @@ also have alias to *F*.

No global aliases, replaceDirectCallers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-If global aliases are not supported. We call ``replaceDirectCallers``. Just
+If global aliases are not supported, we call ``replaceDirectCallers``. Just
go through all calls of *G* and replace it with calls of *F*. If you look into
the method you will see that it scans all uses of *G* too, and if use is callee
(if user is call instruction and *G* is used as what to be called), we replace

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d28eb68..2dc8f9f 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -971,6 +971,10 @@ Syntax:

   declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
   declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
   declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
   declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)

@@ -983,7 +987,10 @@

The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX
instructions. The '``prefetch.*``' instructions bring the cache line containing
the specified address in global or local memory address space into the
-specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
+specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified, then the
+prefetch instruction brings the cache line containing the specified address in the
+'``.const``' or '``.param``' memory state space for subsequent use by the
+'``cp.async.bulk.tensor``' instruction. The '``prefetchu.*``' instruction brings the cache line
containing the specified generic address into the specified uniform cache level.
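For illustration, a call site for the new '``tensormap``' variants might look
like the following sketch (a hypothetical module; the pointer names are
arbitrary, and only the generic and ``.const`` overloads are shown)::

  define void @prefetch_tensormap(ptr %tmap, ptr addrspace(4) %const_tmap) {
    ; Prefetch a tensor-map descriptor through a generic pointer.
    call void @llvm.nvvm.prefetch.tensormap.p0(ptr %tmap)
    ; Prefetch the same kind of descriptor through a .const-space pointer.
    call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_tmap)
    ret void
  }

  declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
  declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))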
If no address space is specified, it is assumed to be the generic address space.
The intrinsic uses an eviction priority which can be specified by the
'``.level::eviction_priority``' modifier.

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 0c49fc8..b38ed62 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -73,6 +73,7 @@ Changes to Vectorizers

 * Added initial support for copyable elements in SLP, which models copyable
   elements as add <element>, 0, i.e. uses identity constants for missing lanes.
+* The SLP vectorizer now has initial support for recognizing FMA/FMAD patterns.

 Changes to the AArch64 Backend
 ------------------------------

@@ -104,6 +105,10 @@ Changes to the PowerPC Backend

 Changes to the RISC-V Backend
 -----------------------------

+* `llvm-objdump` now has basic support for switching between disassembling code
+  and data using mapping symbols such as `$x` and `$d`. Switching architectures
+  using `$x` with an architecture string suffix is not yet supported.
+
 Changes to the WebAssembly Backend
 ----------------------------------

diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index ea5eac4..1f27213 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -454,28 +454,28 @@ protected:
     return NextPowerOf2(NumEntries * 4 / 3 + 1);
   }

-  void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) {
+  void moveFromOldBuckets(iterator_range<BucketT *> OldBuckets) {
     initEmpty();

     // Insert all the old elements.
     const KeyT EmptyKey = getEmptyKey();
     const KeyT TombstoneKey = getTombstoneKey();
-    for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) {
-      if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) &&
-          !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) {
+    for (BucketT &B : OldBuckets) {
+      if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
         // Insert the key/value into the new table.
         BucketT *DestBucket;
-        bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket);
+        bool FoundVal = LookupBucketFor(B.getFirst(), DestBucket);
         (void)FoundVal; // silence warning.
         assert(!FoundVal && "Key already in new map?");
-        DestBucket->getFirst() = std::move(B->getFirst());
-        ::new (&DestBucket->getSecond()) ValueT(std::move(B->getSecond()));
+        DestBucket->getFirst() = std::move(B.getFirst());
+        ::new (&DestBucket->getSecond()) ValueT(std::move(B.getSecond()));
         incrementNumEntries();

         // Free the value.
-        B->getSecond().~ValueT();
+        B.getSecond().~ValueT();
       }
-      B->getFirst().~KeyT();
+      B.getFirst().~KeyT();
     }
   }

@@ -867,7 +867,8 @@
       return;
     }

-    this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets);
+    this->moveFromOldBuckets(
+        llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets));

    // Free the old table.
deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, @@ -952,6 +953,9 @@ class SmallDenseMap struct LargeRep { BucketT *Buckets; unsigned NumBuckets; + iterator_range<BucketT *> buckets() { + return llvm::make_range(Buckets, Buckets + NumBuckets); + } }; /// A "union" of an inline bucket array and the struct representing @@ -1129,7 +1133,7 @@ public: Small = false; new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); } - this->moveFromOldBuckets(TmpBegin, TmpEnd); + this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); return; } @@ -1141,8 +1145,7 @@ public: new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); } - this->moveFromOldBuckets(OldRep.Buckets, - OldRep.Buckets + OldRep.NumBuckets); + this->moveFromOldBuckets(OldRep.buckets()); // Free the old table. deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index 892040e..d49ef1d 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -46,7 +46,7 @@ namespace llvm { /// sets are often small. In this case, no memory allocation is used, and only /// light-weight and cache-efficient scanning is used. /// -/// Large sets use a classic exponentially-probed hash table. Empty buckets are +/// Large sets use a classic quadratically-probed hash table. Empty buckets are /// represented with an illegal pointer value (-1) to allow null pointers to be /// inserted. Tombstones are represented with another illegal pointer value /// (-2), to allow deletion. The hash table is resized when the table is 3/4 or diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 0ced1c0..16aca4d 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -38,6 +38,8 @@ namespace llvm { LLVM_ABI bool getAsSignedInteger(StringRef Str, unsigned Radix, long long &Result); + LLVM_ABI unsigned getAutoSenseRadix(StringRef &Str); + LLVM_ABI bool consumeUnsignedInteger(StringRef &Str, unsigned Radix, unsigned long long &Result); LLVM_ABI bool consumeSignedInteger(StringRef &Str, unsigned Radix, diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index 9d07bc0..88ac0a1 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -360,9 +360,11 @@ public: std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHS.Size); } bool overlapsWith(const ResourceBinding &RHS) const { + if (Space != RHS.Space) + return false; if (Size == UINT32_MAX) return LowerBound < RHS.LowerBound; - return Space == RHS.Space && LowerBound + Size - 1 >= RHS.LowerBound; + return LowerBound + Size - 1 >= RHS.LowerBound; } }; diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index f98bd68..1679596 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -47,994 +47,908 @@ #include "llvm/Support/Compiler.h" namespace llvm { - class AAResults; - template <typename T> class ArrayRef; - class Loop; - class LoopInfo; - class SCEVConstant; - class raw_ostream; - - /// Dependence - This class represents a dependence between two memory - /// memory references in a function. 
It contains minimal information and
-  /// is used in the very common situation where the compiler is unable to
-  /// determine anything beyond the existence of a dependence; that is, it
-  /// represents a confused dependence (see also FullDependence). In most
-  /// cases (for output, flow, and anti dependences), the dependence implies
-  /// an ordering, where the source must precede the destination; in contrast,
-  /// input dependences are unordered.
-  ///
-  /// When a dependence graph is built, each Dependence will be a member of
-  /// the set of predecessor edges for its destination instruction and a set
-  /// if successor edges for its source instruction. These sets are represented
-  /// as singly-linked lists, with the "next" fields stored in the dependence
-  /// itelf.
-  class LLVM_ABI Dependence {
-  protected:
-    Dependence(Dependence &&) = default;
-    Dependence &operator=(Dependence &&) = default;
-
-  public:
-    Dependence(Instruction *Source, Instruction *Destination,
-               const SCEVUnionPredicate &A)
-        : Src(Source), Dst(Destination), Assumptions(A) {}
-    virtual ~Dependence() = default;
-
-    /// Dependence::DVEntry - Each level in the distance/direction vector
-    /// has a direction (or perhaps a union of several directions), and
-    /// perhaps a distance.
-    struct DVEntry {
-      enum : unsigned char {
-        NONE = 0,
-        LT = 1,
-        EQ = 2,
-        LE = 3,
-        GT = 4,
-        NE = 5,
-        GE = 6,
-        ALL = 7
-      };
-      unsigned char Direction : 3; // Init to ALL, then refine.
-      bool Scalar : 1; // Init to true.
-      bool PeelFirst : 1; // Peeling the first iteration will break dependence.
-      bool PeelLast : 1; // Peeling the last iteration will break the dependence.
-      bool Splitable : 1; // Splitting the loop will break dependence.
-      const SCEV *Distance = nullptr; // NULL implies no distance available.
-      DVEntry()
-          : Direction(ALL), Scalar(true), PeelFirst(false), PeelLast(false),
-            Splitable(false) {}
+class AAResults;
+template <typename T> class ArrayRef;
+class Loop;
+class LoopInfo;
+class SCEVConstant;
+class raw_ostream;
+
+/// Dependence - This class represents a dependence between two memory
+/// references in a function. It contains minimal information and
+/// is used in the very common situation where the compiler is unable to
+/// determine anything beyond the existence of a dependence; that is, it
+/// represents a confused dependence (see also FullDependence). In most
+/// cases (for output, flow, and anti dependences), the dependence implies
+/// an ordering, where the source must precede the destination; in contrast,
+/// input dependences are unordered.
+///
+/// When a dependence graph is built, each Dependence will be a member of
+/// the set of predecessor edges for its destination instruction and a set
+/// of successor edges for its source instruction. These sets are represented
+/// as singly-linked lists, with the "next" fields stored in the dependence
+/// itself.
+class LLVM_ABI Dependence {
+protected:
+  Dependence(Dependence &&) = default;
+  Dependence &operator=(Dependence &&) = default;
+
+public:
+  Dependence(Instruction *Source, Instruction *Destination,
+             const SCEVUnionPredicate &A)
+      : Src(Source), Dst(Destination), Assumptions(A) {}
+  virtual ~Dependence() = default;
+
+  /// Dependence::DVEntry - Each level in the distance/direction vector
+  /// has a direction (or perhaps a union of several directions), and
+  /// perhaps a distance.
+ struct DVEntry { + enum : unsigned char { + NONE = 0, + LT = 1, + EQ = 2, + LE = 3, + GT = 4, + NE = 5, + GE = 6, + ALL = 7 }; + unsigned char Direction : 3; // Init to ALL, then refine. + bool Scalar : 1; // Init to true. + bool PeelFirst : 1; // Peeling the first iteration will break dependence. + bool PeelLast : 1; // Peeling the last iteration will break the dependence. + bool Splitable : 1; // Splitting the loop will break dependence. + const SCEV *Distance = nullptr; // NULL implies no distance available. + DVEntry() + : Direction(ALL), Scalar(true), PeelFirst(false), PeelLast(false), + Splitable(false) {} + }; - /// getSrc - Returns the source instruction for this dependence. - /// - Instruction *getSrc() const { return Src; } - - /// getDst - Returns the destination instruction for this dependence. - /// - Instruction *getDst() const { return Dst; } - - /// isInput - Returns true if this is an input dependence. - /// - bool isInput() const; - - /// isOutput - Returns true if this is an output dependence. - /// - bool isOutput() const; - - /// isFlow - Returns true if this is a flow (aka true) dependence. - /// - bool isFlow() const; - - /// isAnti - Returns true if this is an anti dependence. - /// - bool isAnti() const; - - /// isOrdered - Returns true if dependence is Output, Flow, or Anti - /// - bool isOrdered() const { return isOutput() || isFlow() || isAnti(); } + /// getSrc - Returns the source instruction for this dependence. + Instruction *getSrc() const { return Src; } - /// isUnordered - Returns true if dependence is Input - /// - bool isUnordered() const { return isInput(); } + /// getDst - Returns the destination instruction for this dependence. + Instruction *getDst() const { return Dst; } - /// isLoopIndependent - Returns true if this is a loop-independent - /// dependence. - virtual bool isLoopIndependent() const { return true; } + /// isInput - Returns true if this is an input dependence. + bool isInput() const; - /// isConfused - Returns true if this dependence is confused - /// (the compiler understands nothing and makes worst-case - /// assumptions). - virtual bool isConfused() const { return true; } + /// isOutput - Returns true if this is an output dependence. + bool isOutput() const; - /// isConsistent - Returns true if this dependence is consistent - /// (occurs every time the source and destination are executed). - virtual bool isConsistent() const { return false; } + /// isFlow - Returns true if this is a flow (aka true) dependence. + bool isFlow() const; - /// getLevels - Returns the number of common loops surrounding the - /// source and destination of the dependence. - virtual unsigned getLevels() const { return 0; } + /// isAnti - Returns true if this is an anti dependence. + bool isAnti() const; - /// getDirection - Returns the direction associated with a particular - /// level. - virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; } + /// isOrdered - Returns true if dependence is Output, Flow, or Anti + bool isOrdered() const { return isOutput() || isFlow() || isAnti(); } - /// getDistance - Returns the distance (or NULL) associated with a - /// particular level. - virtual const SCEV *getDistance(unsigned Level) const { return nullptr; } + /// isUnordered - Returns true if dependence is Input + bool isUnordered() const { return isInput(); } - /// Check if the direction vector is negative. A negative direction - /// vector means Src and Dst are reversed in the actual program. 
- virtual bool isDirectionNegative() const { return false; } + /// isLoopIndependent - Returns true if this is a loop-independent + /// dependence. + virtual bool isLoopIndependent() const { return true; } - /// If the direction vector is negative, normalize the direction - /// vector to make it non-negative. Normalization is done by reversing - /// Src and Dst, plus reversing the dependence directions and distances - /// in the vector. - virtual bool normalize(ScalarEvolution *SE) { return false; } + /// isConfused - Returns true if this dependence is confused + /// (the compiler understands nothing and makes worst-case assumptions). + virtual bool isConfused() const { return true; } - /// isPeelFirst - Returns true if peeling the first iteration from - /// this loop will break this dependence. - virtual bool isPeelFirst(unsigned Level) const { return false; } + /// isConsistent - Returns true if this dependence is consistent + /// (occurs every time the source and destination are executed). + virtual bool isConsistent() const { return false; } - /// isPeelLast - Returns true if peeling the last iteration from - /// this loop will break this dependence. - virtual bool isPeelLast(unsigned Level) const { return false; } + /// getLevels - Returns the number of common loops surrounding the + /// source and destination of the dependence. + virtual unsigned getLevels() const { return 0; } - /// isSplitable - Returns true if splitting this loop will break - /// the dependence. - virtual bool isSplitable(unsigned Level) const { return false; } + /// getDirection - Returns the direction associated with a particular level. + virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; } - /// isScalar - Returns true if a particular level is scalar; that is, - /// if no subscript in the source or destination mention the induction - /// variable associated with the loop at this level. - virtual bool isScalar(unsigned Level) const; + /// getDistance - Returns the distance (or NULL) associated with a particular + /// level. + virtual const SCEV *getDistance(unsigned Level) const { return nullptr; } - /// getNextPredecessor - Returns the value of the NextPredecessor - /// field. - const Dependence *getNextPredecessor() const { return NextPredecessor; } + /// Check if the direction vector is negative. A negative direction + /// vector means Src and Dst are reversed in the actual program. + virtual bool isDirectionNegative() const { return false; } + + /// If the direction vector is negative, normalize the direction + /// vector to make it non-negative. Normalization is done by reversing + /// Src and Dst, plus reversing the dependence directions and distances + /// in the vector. + virtual bool normalize(ScalarEvolution *SE) { return false; } - /// getNextSuccessor - Returns the value of the NextSuccessor - /// field. - const Dependence *getNextSuccessor() const { return NextSuccessor; } + /// isPeelFirst - Returns true if peeling the first iteration from + /// this loop will break this dependence. + virtual bool isPeelFirst(unsigned Level) const { return false; } - /// setNextPredecessor - Sets the value of the NextPredecessor - /// field. - void setNextPredecessor(const Dependence *pred) { NextPredecessor = pred; } + /// isPeelLast - Returns true if peeling the last iteration from + /// this loop will break this dependence. + virtual bool isPeelLast(unsigned Level) const { return false; } - /// setNextSuccessor - Sets the value of the NextSuccessor - /// field. 
- void setNextSuccessor(const Dependence *succ) { NextSuccessor = succ; } + /// isSplitable - Returns true if splitting this loop will break the + /// dependence. + virtual bool isSplitable(unsigned Level) const { return false; } - /// getRuntimeAssumptions - Returns the runtime assumptions under which this - /// Dependence relation is valid. - SCEVUnionPredicate getRuntimeAssumptions() const { return Assumptions; } + /// isScalar - Returns true if a particular level is scalar; that is, + /// if no subscript in the source or destination mention the induction + /// variable associated with the loop at this level. + virtual bool isScalar(unsigned Level) const; + + /// getNextPredecessor - Returns the value of the NextPredecessor field. + const Dependence *getNextPredecessor() const { return NextPredecessor; } + + /// getNextSuccessor - Returns the value of the NextSuccessor field. + const Dependence *getNextSuccessor() const { return NextSuccessor; } + + /// setNextPredecessor - Sets the value of the NextPredecessor + /// field. + void setNextPredecessor(const Dependence *pred) { NextPredecessor = pred; } + + /// setNextSuccessor - Sets the value of the NextSuccessor field. + void setNextSuccessor(const Dependence *succ) { NextSuccessor = succ; } + + /// getRuntimeAssumptions - Returns the runtime assumptions under which this + /// Dependence relation is valid. + SCEVUnionPredicate getRuntimeAssumptions() const { return Assumptions; } + + /// dump - For debugging purposes, dumps a dependence to OS. + void dump(raw_ostream &OS) const; + +protected: + Instruction *Src, *Dst; + +private: + SCEVUnionPredicate Assumptions; + const Dependence *NextPredecessor = nullptr, *NextSuccessor = nullptr; + friend class DependenceInfo; +}; + +/// FullDependence - This class represents a dependence between two memory +/// references in a function. It contains detailed information about the +/// dependence (direction vectors, etc.) and is used when the compiler is +/// able to accurately analyze the interaction of the references; that is, +/// it is not a confused dependence (see Dependence). In most cases +/// (for output, flow, and anti dependences), the dependence implies an +/// ordering, where the source must precede the destination; in contrast, +/// input dependences are unordered. +class LLVM_ABI FullDependence final : public Dependence { +public: + FullDependence(Instruction *Source, Instruction *Destination, + const SCEVUnionPredicate &Assumes, + bool PossiblyLoopIndependent, unsigned Levels); + + /// isLoopIndependent - Returns true if this is a loop-independent + /// dependence. + bool isLoopIndependent() const override { return LoopIndependent; } + + /// isConfused - Returns true if this dependence is confused + /// (the compiler understands nothing and makes worst-case + /// assumptions). + bool isConfused() const override { return false; } + + /// isConsistent - Returns true if this dependence is consistent + /// (occurs every time the source and destination are executed). + bool isConsistent() const override { return Consistent; } + + /// getLevels - Returns the number of common loops surrounding the + /// source and destination of the dependence. + unsigned getLevels() const override { return Levels; } + + /// getDirection - Returns the direction associated with a particular + /// level. + unsigned getDirection(unsigned Level) const override; + + /// getDistance - Returns the distance (or NULL) associated with a + /// particular level. 
+  const SCEV *getDistance(unsigned Level) const override;
+
+  /// Check if the direction vector is negative. A negative direction
+  /// vector means Src and Dst are reversed in the actual program.
+  bool isDirectionNegative() const override;
+
+  /// If the direction vector is negative, normalize the direction
+  /// vector to make it non-negative. Normalization is done by reversing
+  /// Src and Dst, plus reversing the dependence directions and distances
+  /// in the vector.
+  bool normalize(ScalarEvolution *SE) override;
+
+  /// isPeelFirst - Returns true if peeling the first iteration from
+  /// this loop will break this dependence.
+  bool isPeelFirst(unsigned Level) const override;
+
+  /// isPeelLast - Returns true if peeling the last iteration from
+  /// this loop will break this dependence.
+  bool isPeelLast(unsigned Level) const override;
+
+  /// isSplitable - Returns true if splitting the loop will break
+  /// the dependence.
+  bool isSplitable(unsigned Level) const override;
+
+  /// isScalar - Returns true if a particular level is scalar; that is,
+  /// if no subscript in the source or destination mention the induction
+  /// variable associated with the loop at this level.
+  bool isScalar(unsigned Level) const override;
+
+private:
+  unsigned short Levels;
+  bool LoopIndependent;
+  bool Consistent; // Init to true, then refine.
+  std::unique_ptr<DVEntry[]> DV;
+  friend class DependenceInfo;
+};
+
+/// DependenceInfo - This class is the main dependence-analysis driver.
+class DependenceInfo {
+public:
+  DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI)
+      : AA(AA), SE(SE), LI(LI), F(F) {}
+
+  /// Handle transitive invalidation when the cached analysis results go away.
+  LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA,
+                           FunctionAnalysisManager::Invalidator &Inv);
+
+  /// depends - Tests for a dependence between the Src and Dst instructions.
+  /// Returns NULL if no dependence; otherwise, returns a Dependence (or a
+  /// FullDependence) with as much information as can be gleaned. By default,
+  /// the dependence test collects a set of runtime assumptions that cannot be
+  /// solved at compilation time. By default, UnderRuntimeAssumptions is false
+  /// for a safe approximation of the dependence relation that does not
+  /// require runtime checks.
+  LLVM_ABI std::unique_ptr<Dependence>
+  depends(Instruction *Src, Instruction *Dst,
+          bool UnderRuntimeAssumptions = false);
+
+  /// getSplitIteration - Given a dependence that's splittable at some
+  /// particular level, return the iteration that should be used to split
+  /// the loop.
+  ///
+  /// Generally, the dependence analyzer will be used to build
+  /// a dependence graph for a function (basically a map from instructions
+  /// to dependences). Looking for cycles in the graph shows us loops
+  /// that cannot be trivially vectorized/parallelized.
+  ///
+  /// We can try to improve the situation by examining all the dependences
+  /// that make up the cycle, looking for ones we can break.
+  /// Sometimes, peeling the first or last iteration of a loop will break
+  /// dependences, and there are flags for those possibilities.
+  /// Sometimes, splitting a loop at some other iteration will do the trick,
+  /// and we've got a flag for that case. Rather than waste the space to
+  /// record the exact iteration (since we rarely know), we provide
+  /// a method that calculates the iteration. It's a drag that it must work
+  /// from scratch, but wonderful in that it's possible.
+ /// + /// Here's an example: + /// + /// for (i = 0; i < 10; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// + /// There's a loop-carried flow dependence from the store to the load, + /// found by the weak-crossing SIV test. The dependence will have a flag, + /// indicating that the dependence can be broken by splitting the loop. + /// Calling getSplitIteration will return 5. + /// Splitting the loop breaks the dependence, like so: + /// + /// for (i = 0; i <= 5; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// for (i = 6; i < 10; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// + /// breaks the dependence and allows us to vectorize/parallelize + /// both loops. + LLVM_ABI const SCEV *getSplitIteration(const Dependence &Dep, unsigned Level); + + Function *getFunction() const { return F; } + + /// getRuntimeAssumptions - Returns all the runtime assumptions under which + /// the dependence test is valid. + LLVM_ABI SCEVUnionPredicate getRuntimeAssumptions() const; + +private: + AAResults *AA; + ScalarEvolution *SE; + LoopInfo *LI; + Function *F; + SmallVector<const SCEVPredicate *, 4> Assumptions; + + /// Subscript - This private struct represents a pair of subscripts from + /// a pair of potentially multi-dimensional array references. We use a + /// vector of them to guide subscript partitioning. + struct Subscript { + const SCEV *Src; + const SCEV *Dst; + enum ClassificationKind { ZIV, SIV, RDIV, MIV, NonLinear } Classification; + SmallBitVector Loops; + SmallBitVector GroupLoops; + SmallBitVector Group; + }; - /// dump - For debugging purposes, dumps a dependence to OS. - /// - void dump(raw_ostream &OS) const; + struct CoefficientInfo { + const SCEV *Coeff; + const SCEV *PosPart; + const SCEV *NegPart; + const SCEV *Iterations; + }; - protected: - Instruction *Src, *Dst; + struct BoundInfo { + const SCEV *Iterations; + const SCEV *Upper[8]; + const SCEV *Lower[8]; + unsigned char Direction; + unsigned char DirSet; + }; + /// Constraint - This private class represents a constraint, as defined + /// in the paper + /// + /// Practical Dependence Testing + /// Goff, Kennedy, Tseng + /// PLDI 1991 + /// + /// There are 5 kinds of constraint, in a hierarchy. + /// 1) Any - indicates no constraint, any dependence is possible. + /// 2) Line - A line ax + by = c, where a, b, and c are parameters, + /// representing the dependence equation. + /// 3) Distance - The value d of the dependence distance; + /// 4) Point - A point <x, y> representing the dependence from + /// iteration x to iteration y. + /// 5) Empty - No dependence is possible. + class Constraint { private: - SCEVUnionPredicate Assumptions; - const Dependence *NextPredecessor = nullptr, *NextSuccessor = nullptr; - friend class DependenceInfo; - }; + enum ConstraintKind { Empty, Point, Distance, Line, Any } Kind; + ScalarEvolution *SE; + const SCEV *A; + const SCEV *B; + const SCEV *C; + const Loop *AssociatedLoop; - /// FullDependence - This class represents a dependence between two memory - /// references in a function. It contains detailed information about the - /// dependence (direction vectors, etc.) and is used when the compiler is - /// able to accurately analyze the interaction of the references; that is, - /// it is not a confused dependence (see Dependence). In most cases - /// (for output, flow, and anti dependences), the dependence implies an - /// ordering, where the source must precede the destination; in contrast, - /// input dependences are unordered. 
- class LLVM_ABI FullDependence final : public Dependence { public: - FullDependence(Instruction *Source, Instruction *Destination, - const SCEVUnionPredicate &Assumes, - bool PossiblyLoopIndependent, unsigned Levels); - - /// isLoopIndependent - Returns true if this is a loop-independent - /// dependence. - bool isLoopIndependent() const override { return LoopIndependent; } - - /// isConfused - Returns true if this dependence is confused - /// (the compiler understands nothing and makes worst-case - /// assumptions). - bool isConfused() const override { return false; } - - /// isConsistent - Returns true if this dependence is consistent - /// (occurs every time the source and destination are executed). - bool isConsistent() const override { return Consistent; } - - /// getLevels - Returns the number of common loops surrounding the - /// source and destination of the dependence. - unsigned getLevels() const override { return Levels; } - - /// getDirection - Returns the direction associated with a particular - /// level. - unsigned getDirection(unsigned Level) const override; - - /// getDistance - Returns the distance (or NULL) associated with a - /// particular level. - const SCEV *getDistance(unsigned Level) const override; - - /// Check if the direction vector is negative. A negative direction - /// vector means Src and Dst are reversed in the actual program. - bool isDirectionNegative() const override; - - /// If the direction vector is negative, normalize the direction - /// vector to make it non-negative. Normalization is done by reversing - /// Src and Dst, plus reversing the dependence directions and distances - /// in the vector. - bool normalize(ScalarEvolution *SE) override; - - /// isPeelFirst - Returns true if peeling the first iteration from - /// this loop will break this dependence. - bool isPeelFirst(unsigned Level) const override; - - /// isPeelLast - Returns true if peeling the last iteration from - /// this loop will break this dependence. - bool isPeelLast(unsigned Level) const override; - - /// isSplitable - Returns true if splitting the loop will break - /// the dependence. - bool isSplitable(unsigned Level) const override; - - /// isScalar - Returns true if a particular level is scalar; that is, - /// if no subscript in the source or destination mention the induction - /// variable associated with the loop at this level. - bool isScalar(unsigned Level) const override; + /// isEmpty - Return true if the constraint is of kind Empty. + bool isEmpty() const { return Kind == Empty; } - private: - unsigned short Levels; - bool LoopIndependent; - bool Consistent; // Init to true, then refine. - std::unique_ptr<DVEntry[]> DV; - friend class DependenceInfo; - }; + /// isPoint - Return true if the constraint is of kind Point. + bool isPoint() const { return Kind == Point; } - /// DependenceInfo - This class is the main dependence-analysis driver. - /// - class DependenceInfo { - public: - DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, - LoopInfo *LI) - : AA(AA), SE(SE), LI(LI), F(F) {} - - /// Handle transitive invalidation when the cached analysis results go away. - LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &Inv); - - /// depends - Tests for a dependence between the Src and Dst instructions. - /// Returns NULL if no dependence; otherwise, returns a Dependence (or a - /// FullDependence) with as much information as can be gleaned. 
By default, - /// the dependence test collects a set of runtime assumptions that cannot be - /// solved at compilation time. By default UnderRuntimeAssumptions is false - /// for a safe approximation of the dependence relation that does not - /// require runtime checks. - LLVM_ABI std::unique_ptr<Dependence> - depends(Instruction *Src, Instruction *Dst, - bool UnderRuntimeAssumptions = false); - - /// getSplitIteration - Give a dependence that's splittable at some - /// particular level, return the iteration that should be used to split - /// the loop. - /// - /// Generally, the dependence analyzer will be used to build - /// a dependence graph for a function (basically a map from instructions - /// to dependences). Looking for cycles in the graph shows us loops - /// that cannot be trivially vectorized/parallelized. - /// - /// We can try to improve the situation by examining all the dependences - /// that make up the cycle, looking for ones we can break. - /// Sometimes, peeling the first or last iteration of a loop will break - /// dependences, and there are flags for those possibilities. - /// Sometimes, splitting a loop at some other iteration will do the trick, - /// and we've got a flag for that case. Rather than waste the space to - /// record the exact iteration (since we rarely know), we provide - /// a method that calculates the iteration. It's a drag that it must work - /// from scratch, but wonderful in that it's possible. - /// - /// Here's an example: - /// - /// for (i = 0; i < 10; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// - /// There's a loop-carried flow dependence from the store to the load, - /// found by the weak-crossing SIV test. The dependence will have a flag, - /// indicating that the dependence can be broken by splitting the loop. - /// Calling getSplitIteration will return 5. - /// Splitting the loop breaks the dependence, like so: - /// - /// for (i = 0; i <= 5; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// for (i = 6; i < 10; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// - /// breaks the dependence and allows us to vectorize/parallelize - /// both loops. - LLVM_ABI const SCEV *getSplitIteration(const Dependence &Dep, - unsigned Level); - - Function *getFunction() const { return F; } - - /// getRuntimeAssumptions - Returns all the runtime assumptions under which - /// the dependence test is valid. - LLVM_ABI SCEVUnionPredicate getRuntimeAssumptions() const; + /// isDistance - Return true if the constraint is of kind Distance. + bool isDistance() const { return Kind == Distance; } - private: - AAResults *AA; - ScalarEvolution *SE; - LoopInfo *LI; - Function *F; - SmallVector<const SCEVPredicate *, 4> Assumptions; - - /// Subscript - This private struct represents a pair of subscripts from - /// a pair of potentially multi-dimensional array references. We use a - /// vector of them to guide subscript partitioning. - struct Subscript { - const SCEV *Src; - const SCEV *Dst; - enum ClassificationKind { ZIV, SIV, RDIV, MIV, NonLinear } Classification; - SmallBitVector Loops; - SmallBitVector GroupLoops; - SmallBitVector Group; - }; + /// isLine - Return true if the constraint is of kind Line. + /// Since Distance's can also be represented as Lines, we also return + /// true if the constraint is of kind Distance. 
+ bool isLine() const { return Kind == Line || Kind == Distance; } - struct CoefficientInfo { - const SCEV *Coeff; - const SCEV *PosPart; - const SCEV *NegPart; - const SCEV *Iterations; - }; + /// isAny - Return true if the constraint is of kind Any; + bool isAny() const { return Kind == Any; } - struct BoundInfo { - const SCEV *Iterations; - const SCEV *Upper[8]; - const SCEV *Lower[8]; - unsigned char Direction; - unsigned char DirSet; - }; + /// getX - If constraint is a point <X, Y>, returns X. + /// Otherwise assert. + LLVM_ABI const SCEV *getX() const; - /// Constraint - This private class represents a constraint, as defined - /// in the paper - /// - /// Practical Dependence Testing - /// Goff, Kennedy, Tseng - /// PLDI 1991 - /// - /// There are 5 kinds of constraint, in a hierarchy. - /// 1) Any - indicates no constraint, any dependence is possible. - /// 2) Line - A line ax + by = c, where a, b, and c are parameters, - /// representing the dependence equation. - /// 3) Distance - The value d of the dependence distance; - /// 4) Point - A point <x, y> representing the dependence from - /// iteration x to iteration y. - /// 5) Empty - No dependence is possible. - class Constraint { - private: - enum ConstraintKind { Empty, Point, Distance, Line, Any } Kind; - ScalarEvolution *SE; - const SCEV *A; - const SCEV *B; - const SCEV *C; - const Loop *AssociatedLoop; - - public: - /// isEmpty - Return true if the constraint is of kind Empty. - bool isEmpty() const { return Kind == Empty; } - - /// isPoint - Return true if the constraint is of kind Point. - bool isPoint() const { return Kind == Point; } - - /// isDistance - Return true if the constraint is of kind Distance. - bool isDistance() const { return Kind == Distance; } - - /// isLine - Return true if the constraint is of kind Line. - /// Since Distance's can also be represented as Lines, we also return - /// true if the constraint is of kind Distance. - bool isLine() const { return Kind == Line || Kind == Distance; } - - /// isAny - Return true if the constraint is of kind Any; - bool isAny() const { return Kind == Any; } - - /// getX - If constraint is a point <X, Y>, returns X. - /// Otherwise assert. - LLVM_ABI const SCEV *getX() const; - - /// getY - If constraint is a point <X, Y>, returns Y. - /// Otherwise assert. - LLVM_ABI const SCEV *getY() const; - - /// getA - If constraint is a line AX + BY = C, returns A. - /// Otherwise assert. - LLVM_ABI const SCEV *getA() const; - - /// getB - If constraint is a line AX + BY = C, returns B. - /// Otherwise assert. - LLVM_ABI const SCEV *getB() const; - - /// getC - If constraint is a line AX + BY = C, returns C. - /// Otherwise assert. - LLVM_ABI const SCEV *getC() const; - - /// getD - If constraint is a distance, returns D. - /// Otherwise assert. - LLVM_ABI const SCEV *getD() const; - - /// getAssociatedLoop - Returns the loop associated with this constraint. - LLVM_ABI const Loop *getAssociatedLoop() const; - - /// setPoint - Change a constraint to Point. - LLVM_ABI void setPoint(const SCEV *X, const SCEV *Y, - const Loop *CurrentLoop); - - /// setLine - Change a constraint to Line. - LLVM_ABI void setLine(const SCEV *A, const SCEV *B, const SCEV *C, - const Loop *CurrentLoop); - - /// setDistance - Change a constraint to Distance. - LLVM_ABI void setDistance(const SCEV *D, const Loop *CurrentLoop); - - /// setEmpty - Change a constraint to Empty. - LLVM_ABI void setEmpty(); - - /// setAny - Change a constraint to Any. 
- LLVM_ABI void setAny(ScalarEvolution *SE); - - /// dump - For debugging purposes. Dumps the constraint - /// out to OS. - LLVM_ABI void dump(raw_ostream &OS) const; - }; + /// getY - If constraint is a point <X, Y>, returns Y. + /// Otherwise assert. + LLVM_ABI const SCEV *getY() const; - /// establishNestingLevels - Examines the loop nesting of the Src and Dst - /// instructions and establishes their shared loops. Sets the variables - /// CommonLevels, SrcLevels, and MaxLevels. - /// The source and destination instructions needn't be contained in the same - /// loop. The routine establishNestingLevels finds the level of most deeply - /// nested loop that contains them both, CommonLevels. An instruction that's - /// not contained in a loop is at level = 0. MaxLevels is equal to the level - /// of the source plus the level of the destination, minus CommonLevels. - /// This lets us allocate vectors MaxLevels in length, with room for every - /// distinct loop referenced in both the source and destination subscripts. - /// The variable SrcLevels is the nesting depth of the source instruction. - /// It's used to help calculate distinct loops referenced by the destination. - /// Here's the map from loops to levels: - /// 0 - unused - /// 1 - outermost common loop - /// ... - other common loops - /// CommonLevels - innermost common loop - /// ... - loops containing Src but not Dst - /// SrcLevels - innermost loop containing Src but not Dst - /// ... - loops containing Dst but not Src - /// MaxLevels - innermost loop containing Dst but not Src - /// Consider the follow code fragment: - /// for (a = ...) { - /// for (b = ...) { - /// for (c = ...) { - /// for (d = ...) { - /// A[] = ...; - /// } - /// } - /// for (e = ...) { - /// for (f = ...) { - /// for (g = ...) { - /// ... = A[]; - /// } - /// } - /// } - /// } - /// } - /// If we're looking at the possibility of a dependence between the store - /// to A (the Src) and the load from A (the Dst), we'll note that they - /// have 2 loops in common, so CommonLevels will equal 2 and the direction - /// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7. - /// A map from loop names to level indices would look like - /// a - 1 - /// b - 2 = CommonLevels - /// c - 3 - /// d - 4 = SrcLevels - /// e - 5 - /// f - 6 - /// g - 7 = MaxLevels - void establishNestingLevels(const Instruction *Src, - const Instruction *Dst); - - unsigned CommonLevels, SrcLevels, MaxLevels; - - /// mapSrcLoop - Given one of the loops containing the source, return - /// its level index in our numbering scheme. - unsigned mapSrcLoop(const Loop *SrcLoop) const; - - /// mapDstLoop - Given one of the loops containing the destination, - /// return its level index in our numbering scheme. - unsigned mapDstLoop(const Loop *DstLoop) const; - - /// isLoopInvariant - Returns true if Expression is loop invariant - /// in LoopNest. - bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const; - - /// Makes sure all subscript pairs share the same integer type by - /// sign-extending as necessary. - /// Sign-extending a subscript is safe because getelementptr assumes the - /// array subscripts are signed. - void unifySubscriptType(ArrayRef<Subscript *> Pairs); - - /// removeMatchingExtensions - Examines a subscript pair. - /// If the source and destination are identically sign (or zero) - /// extended, it strips off the extension in an effort to - /// simplify the actual analysis. 
- void removeMatchingExtensions(Subscript *Pair); - - /// collectCommonLoops - Finds the set of loops from the LoopNest that - /// have a level <= CommonLevels and are referred to by the SCEV Expression. - void collectCommonLoops(const SCEV *Expression, - const Loop *LoopNest, - SmallBitVector &Loops) const; - - /// checkSrcSubscript - Examines the SCEV Src, returning true iff it's - /// linear. Collect the set of loops mentioned by Src. - bool checkSrcSubscript(const SCEV *Src, - const Loop *LoopNest, - SmallBitVector &Loops); - - /// checkDstSubscript - Examines the SCEV Dst, returning true iff it's - /// linear. Collect the set of loops mentioned by Dst. - bool checkDstSubscript(const SCEV *Dst, - const Loop *LoopNest, - SmallBitVector &Loops); - - /// isKnownPredicate - Compare X and Y using the predicate Pred. - /// Basically a wrapper for SCEV::isKnownPredicate, - /// but tries harder, especially in the presence of sign and zero - /// extensions and symbolics. - bool isKnownPredicate(ICmpInst::Predicate Pred, - const SCEV *X, - const SCEV *Y) const; - - /// isKnownLessThan - Compare to see if S is less than Size - /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra - /// checking if S is an AddRec and we can prove lessthan using the loop - /// bounds. - bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; - - /// isKnownNonNegative - Compare to see if S is known not to be negative - /// Uses the fact that S comes from Ptr, which may be an inbound GEP, - /// Proving there is no wrapping going on. - bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; - - /// collectUpperBound - All subscripts are the same type (on my machine, - /// an i64). The loop bound may be a smaller type. collectUpperBound - /// find the bound, if available, and zero extends it to the Type T. - /// (I zero extend since the bound should always be >= 0.) - /// If no upper bound is available, return NULL. - const SCEV *collectUpperBound(const Loop *l, Type *T) const; - - /// collectConstantUpperBound - Calls collectUpperBound(), then - /// attempts to cast it to SCEVConstant. If the cast fails, - /// returns NULL. - const SCEVConstant *collectConstantUpperBound(const Loop *l, Type *T) const; - - /// classifyPair - Examines the subscript pair (the Src and Dst SCEVs) - /// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. - /// Collects the associated loops in a set. - Subscript::ClassificationKind classifyPair(const SCEV *Src, - const Loop *SrcLoopNest, - const SCEV *Dst, - const Loop *DstLoopNest, - SmallBitVector &Loops); - - /// testZIV - Tests the ZIV subscript pair (Src and Dst) for dependence. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// If the dependence isn't proven to exist, - /// marks the Result as inconsistent. - bool testZIV(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; - - /// testSIV - Tests the SIV subscript pair (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*j], where - /// i and j are induction variables, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction vector entry and, when possible, - /// the distance vector entry. - /// If the dependence isn't proven to exist, - /// marks the Result as inconsistent. 
- bool testSIV(const SCEV *Src, - const SCEV *Dst, - unsigned &Level, - FullDependence &Result, - Constraint &NewConstraint, - const SCEV *&SplitIter) const; - - /// testRDIV - Tests the RDIV subscript pair (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*j] - /// where i and j are induction variables, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// With minor algebra, this test can also be used for things like - /// [c1 + a1*i + a2*j][c2]. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Marks the Result as inconsistent. - bool testRDIV(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; + /// getA - If constraint is a line AX + BY = C, returns A. + /// Otherwise assert. + LLVM_ABI const SCEV *getA() const; - /// testMIV - Tests the MIV subscript pair (Src and Dst) for dependence. - /// Returns true if dependence disproved. - /// Can sometimes refine direction vectors. - bool testMIV(const SCEV *Src, - const SCEV *Dst, - const SmallBitVector &Loops, - FullDependence &Result) const; - - /// strongSIVtest - Tests the strong SIV subscript pair (Src and Dst) - /// for dependence. - /// Things of the form [c1 + a*i] and [c2 + a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction and distance. - bool strongSIVtest(const SCEV *Coeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a*i] and [c2 - a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - /// Marks the dependence as splitable. - bool weakCrossingSIVtest(const SCEV *SrcCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint, - const SCEV *&SplitIter) const; - - /// ExactSIVtest - Tests the SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - bool exactSIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1] and [c2 + a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. See also weakZeroDstSIVtest. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. 
- /// Set consistent to false. - /// If loop peeling will break the dependence, mark appropriately. - bool weakZeroSrcSIVtest(const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakZeroDstSIVtest - Tests the weak-zero SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a*i] and [c2], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. See also weakZeroSrcSIVtest. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - /// If loop peeling will break the dependence, mark appropriately. - bool weakZeroDstSIVtest(const SCEV *SrcCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// exactRDIVtest - Tests the RDIV subscript pair for dependence. - /// Things of the form [c1 + a*i] and [c2 + b*j], - /// where i and j are induction variable, c1 and c2 are loop invariant, - /// and a and b are constants. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Works in some cases that symbolicRDIVtest doesn't, - /// and vice versa. - bool exactRDIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *SrcLoop, - const Loop *DstLoop, - FullDependence &Result) const; + /// getB - If constraint is a line AX + BY = C, returns B. + /// Otherwise assert. + LLVM_ABI const SCEV *getB() const; - /// symbolicRDIVtest - Tests the RDIV subscript pair for dependence. - /// Things of the form [c1 + a*i] and [c2 + b*j], - /// where i and j are induction variable, c1 and c2 are loop invariant, - /// and a and b are constants. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Works in some cases that exactRDIVtest doesn't, - /// and vice versa. Can also be used as a backup for - /// ordinary SIV tests. - bool symbolicRDIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *SrcLoop, - const Loop *DstLoop) const; - - /// gcdMIVtest - Tests an MIV subscript pair for dependence. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Can sometimes disprove the equal direction for 1 or more loops. - // Can handle some symbolics that even the SIV tests don't get, - /// so we use it as a backup for everything. - bool gcdMIVtest(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; - - /// banerjeeMIVtest - Tests an MIV subscript pair for dependence. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Computes directions. - bool banerjeeMIVtest(const SCEV *Src, - const SCEV *Dst, - const SmallBitVector &Loops, - FullDependence &Result) const; - - /// collectCoefficientInfo - Walks through the subscript, - /// collecting each coefficient, the associated loop bounds, - /// and recording its positive and negative parts for later use. - CoefficientInfo *collectCoeffInfo(const SCEV *Subscript, - bool SrcFlag, - const SCEV *&Constant) const; - - /// getPositivePart - X^+ = max(X, 0). 
- /// - const SCEV *getPositivePart(const SCEV *X) const; - - /// getNegativePart - X^- = min(X, 0). - /// - const SCEV *getNegativePart(const SCEV *X) const; - - /// getLowerBound - Looks through all the bounds info and - /// computes the lower bound given the current direction settings - /// at each level. - const SCEV *getLowerBound(BoundInfo *Bound) const; - - /// getUpperBound - Looks through all the bounds info and - /// computes the upper bound given the current direction settings - /// at each level. - const SCEV *getUpperBound(BoundInfo *Bound) const; - - /// exploreDirections - Hierarchically expands the direction vector - /// search space, combining the directions of discovered dependences - /// in the DirSet field of Bound. Returns the number of distinct - /// dependences discovered. If the dependence is disproved, - /// it will return 0. - unsigned exploreDirections(unsigned Level, - CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - const SmallBitVector &Loops, - unsigned &DepthExpanded, - const SCEV *Delta) const; - - /// testBounds - Returns true iff the current bounds are plausible. - bool testBounds(unsigned char DirKind, - unsigned Level, - BoundInfo *Bound, - const SCEV *Delta) const; - - /// findBoundsALL - Computes the upper and lower bounds for level K - /// using the * direction. Records them in Bound. - void findBoundsALL(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsLT - Computes the upper and lower bounds for level K - /// using the < direction. Records them in Bound. - void findBoundsLT(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsGT - Computes the upper and lower bounds for level K - /// using the > direction. Records them in Bound. - void findBoundsGT(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsEQ - Computes the upper and lower bounds for level K - /// using the = direction. Records them in Bound. - void findBoundsEQ(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// intersectConstraints - Updates X with the intersection - /// of the Constraints X and Y. Returns true if X has changed. - bool intersectConstraints(Constraint *X, - const Constraint *Y); - - /// propagate - Review the constraints, looking for opportunities - /// to simplify a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagate(const SCEV *&Src, - const SCEV *&Dst, - SmallBitVector &Loops, - SmallVectorImpl<Constraint> &Constraints, - bool &Consistent); - - /// propagateDistance - Attempt to propagate a distance - /// constraint into a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagateDistance(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint, - bool &Consistent); - - /// propagatePoint - Attempt to propagate a point - /// constraint into a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - bool propagatePoint(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint); - - /// propagateLine - Attempt to propagate a line - /// constraint into a subscript pair (Src and Dst). 
- /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagateLine(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint, - bool &Consistent); - - /// findCoefficient - Given a linear SCEV, - /// return the coefficient corresponding to specified loop. - /// If there isn't one, return the SCEV constant 0. - /// For example, given a*i + b*j + c*k, returning the coefficient - /// corresponding to the j loop would yield b. - const SCEV *findCoefficient(const SCEV *Expr, - const Loop *TargetLoop) const; - - /// zeroCoefficient - Given a linear SCEV, - /// return the SCEV given by zeroing out the coefficient - /// corresponding to the specified loop. - /// For example, given a*i + b*j + c*k, zeroing the coefficient - /// corresponding to the j loop would yield a*i + c*k. - const SCEV *zeroCoefficient(const SCEV *Expr, - const Loop *TargetLoop) const; - - /// addToCoefficient - Given a linear SCEV Expr, - /// return the SCEV given by adding some Value to the - /// coefficient corresponding to the specified TargetLoop. - /// For example, given a*i + b*j + c*k, adding 1 to the coefficient - /// corresponding to the j loop would yield a*i + (b+1)*j + c*k. - const SCEV *addToCoefficient(const SCEV *Expr, - const Loop *TargetLoop, - const SCEV *Value) const; - - /// updateDirection - Update direction vector entry - /// based on the current constraint. - void updateDirection(Dependence::DVEntry &Level, - const Constraint &CurConstraint) const; - - /// Given a linear access function, tries to recover subscripts - /// for each dimension of the array element access. - bool tryDelinearize(Instruction *Src, Instruction *Dst, - SmallVectorImpl<Subscript> &Pair); - - /// Tries to delinearize \p Src and \p Dst access functions for a fixed size - /// multi-dimensional array. Calls tryDelinearizeFixedSizeImpl() to - /// delinearize \p Src and \p Dst separately, - bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst, - const SCEV *SrcAccessFn, - const SCEV *DstAccessFn, - SmallVectorImpl<const SCEV *> &SrcSubscripts, - SmallVectorImpl<const SCEV *> &DstSubscripts); - - /// Tries to delinearize access function for a multi-dimensional array with - /// symbolic runtime sizes. - /// Returns true upon success and false otherwise. - bool tryDelinearizeParametricSize( - Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, - const SCEV *DstAccessFn, SmallVectorImpl<const SCEV *> &SrcSubscripts, - SmallVectorImpl<const SCEV *> &DstSubscripts); - - /// checkSubscript - Helper function for checkSrcSubscript and - /// checkDstSubscript to avoid duplicate code - bool checkSubscript(const SCEV *Expr, const Loop *LoopNest, - SmallBitVector &Loops, bool IsSrc); - }; // class DependenceInfo - - /// AnalysisPass to compute dependence information in a function - class DependenceAnalysis : public AnalysisInfoMixin<DependenceAnalysis> { - public: - typedef DependenceInfo Result; - LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM); + /// getC - If constraint is a line AX + BY = C, returns C. + /// Otherwise assert. + LLVM_ABI const SCEV *getC() const; - private: - LLVM_ABI static AnalysisKey Key; - friend struct AnalysisInfoMixin<DependenceAnalysis>; - }; // class DependenceAnalysis + /// getD - If constraint is a distance, returns D. + /// Otherwise assert. + LLVM_ABI const SCEV *getD() const; - /// Printer pass to dump DA results. 
-  struct DependenceAnalysisPrinterPass
-      : public PassInfoMixin<DependenceAnalysisPrinterPass> {
-    DependenceAnalysisPrinterPass(raw_ostream &OS,
-                                  bool NormalizeResults = false)
-        : OS(OS), NormalizeResults(NormalizeResults) {}
+    /// getAssociatedLoop - Returns the loop associated with this constraint.
+    LLVM_ABI const Loop *getAssociatedLoop() const;
-    LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+    /// setPoint - Change a constraint to Point.
+    LLVM_ABI void setPoint(const SCEV *X, const SCEV *Y,
+                           const Loop *CurrentLoop);
-    static bool isRequired() { return true; }
+    /// setLine - Change a constraint to Line.
+    LLVM_ABI void setLine(const SCEV *A, const SCEV *B, const SCEV *C,
+                          const Loop *CurrentLoop);
-  private:
-    raw_ostream &OS;
-    bool NormalizeResults;
-  }; // class DependenceAnalysisPrinterPass
+    /// setDistance - Change a constraint to Distance.
+    LLVM_ABI void setDistance(const SCEV *D, const Loop *CurrentLoop);
-  /// Legacy pass manager pass to access dependence information
-  class LLVM_ABI DependenceAnalysisWrapperPass : public FunctionPass {
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    DependenceAnalysisWrapperPass();
+    /// setEmpty - Change a constraint to Empty.
+    LLVM_ABI void setEmpty();
-    bool runOnFunction(Function &F) override;
-    void releaseMemory() override;
-    void getAnalysisUsage(AnalysisUsage &) const override;
-    void print(raw_ostream &, const Module * = nullptr) const override;
-    DependenceInfo &getDI() const;
+    /// setAny - Change a constraint to Any.
+    LLVM_ABI void setAny(ScalarEvolution *SE);
-  private:
-    std::unique_ptr<DependenceInfo> info;
-  }; // class DependenceAnalysisWrapperPass
+    /// dump - For debugging purposes. Dumps the constraint
+    /// out to OS.
+    LLVM_ABI void dump(raw_ostream &OS) const;
+  };
+
+  /// establishNestingLevels - Examines the loop nesting of the Src and Dst
+  /// instructions and establishes their shared loops. Sets the variables
+  /// CommonLevels, SrcLevels, and MaxLevels.
+  /// The source and destination instructions needn't be contained in the same
+  /// loop. The routine establishNestingLevels finds the level of the most
+  /// deeply nested loop that contains them both, CommonLevels. An instruction
+  /// that's not contained in a loop is at level = 0. MaxLevels is equal to the
+  /// level of the source plus the level of the destination, minus
+  /// CommonLevels.
+  /// This lets us allocate vectors MaxLevels in length, with room for every
+  /// distinct loop referenced in both the source and destination subscripts.
+  /// The variable SrcLevels is the nesting depth of the source instruction.
+  /// It's used to help calculate distinct loops referenced by the destination.
+  /// Here's the map from loops to levels:
+  ///            0 - unused
+  ///            1 - outermost common loop
+  ///          ... - other common loops
+  /// CommonLevels - innermost common loop
+  ///          ... - loops containing Src but not Dst
+  ///    SrcLevels - innermost loop containing Src but not Dst
+  ///          ... - loops containing Dst but not Src
+  ///    MaxLevels - innermost loop containing Dst but not Src
+  /// Consider the following code fragment:
+  ///   for (a = ...) {
+  ///     for (b = ...) {
+  ///       for (c = ...) {
+  ///         for (d = ...) {
+  ///           A[] = ...;
+  ///         }
+  ///       }
+  ///       for (e = ...) {
+  ///         for (f = ...) {
+  ///           for (g = ...) {
+  ///             ... = A[];
+  ///           }
+  ///         }
+  ///       }
+  ///     }
+  ///   }
+  /// If we're looking at the possibility of a dependence between the store
+  /// to A (the Src) and the load from A (the Dst), we'll note that they
+  /// have 2 loops in common, so CommonLevels will equal 2 and the direction
+  /// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7.
+  /// A map from loop names to level indices would look like
+  ///   a - 1
+  ///   b - 2 = CommonLevels
+  ///   c - 3
+  ///   d - 4 = SrcLevels
+  ///   e - 5
+  ///   f - 6
+  ///   g - 7 = MaxLevels
+  void establishNestingLevels(const Instruction *Src, const Instruction *Dst);
+
+  unsigned CommonLevels, SrcLevels, MaxLevels;
+
+  /// mapSrcLoop - Given one of the loops containing the source, return
+  /// its level index in our numbering scheme.
+  unsigned mapSrcLoop(const Loop *SrcLoop) const;
+
+  /// mapDstLoop - Given one of the loops containing the destination,
+  /// return its level index in our numbering scheme.
+  unsigned mapDstLoop(const Loop *DstLoop) const;
+
+  /// isLoopInvariant - Returns true if Expression is loop invariant
+  /// in LoopNest.
+  bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const;
+
+  /// Makes sure all subscript pairs share the same integer type by
+  /// sign-extending as necessary.
+  /// Sign-extending a subscript is safe because getelementptr assumes the
+  /// array subscripts are signed.
+  void unifySubscriptType(ArrayRef<Subscript *> Pairs);
+
+  /// removeMatchingExtensions - Examines a subscript pair.
+  /// If the source and destination are identically sign (or zero)
+  /// extended, it strips off the extension in an effort to
+  /// simplify the actual analysis.
+  void removeMatchingExtensions(Subscript *Pair);
+
+  /// collectCommonLoops - Finds the set of loops from the LoopNest that
+  /// have a level <= CommonLevels and are referred to by the SCEV Expression.
+  void collectCommonLoops(const SCEV *Expression, const Loop *LoopNest,
+                          SmallBitVector &Loops) const;
+
+  /// checkSrcSubscript - Examines the SCEV Src, returning true iff it's
+  /// linear. Collect the set of loops mentioned by Src.
+  bool checkSrcSubscript(const SCEV *Src, const Loop *LoopNest,
+                         SmallBitVector &Loops);
+
+  /// checkDstSubscript - Examines the SCEV Dst, returning true iff it's
+  /// linear. Collect the set of loops mentioned by Dst.
+  bool checkDstSubscript(const SCEV *Dst, const Loop *LoopNest,
+                         SmallBitVector &Loops);
+
+  /// isKnownPredicate - Compare X and Y using the predicate Pred.
+  /// Basically a wrapper for SCEV::isKnownPredicate,
+  /// but tries harder, especially in the presence of sign and zero
+  /// extensions and symbolics.
+  bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
+                        const SCEV *Y) const;
+
+  /// isKnownLessThan - Compare to see if S is less than Size.
+  /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra
+  /// checking if S is an AddRec and we can prove less-than using the loop
+  /// bounds.
+  bool isKnownLessThan(const SCEV *S, const SCEV *Size) const;
+
+  /// isKnownNonNegative - Compare to see if S is known not to be negative.
+  /// Uses the fact that S comes from Ptr, which may be an inbounds GEP,
+  /// proving that no wrapping is going on.
+  bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const;
+
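As a quick illustration of the level arithmetic documented at establishNestingLevels above, here is a minimal standalone C++ sketch (hypothetical helper, not part of DependenceInfo) reproducing the CommonLevels/SrcLevels/MaxLevels bookkeeping:

#include <algorithm>
#include <cassert>

struct NestingLevels {
  unsigned CommonLevels, SrcLevels, MaxLevels;
};

// SrcDepth and DstDepth are the loop depths of the two instructions;
// CommonDepth is the depth of the innermost loop containing both.
NestingLevels computeLevels(unsigned SrcDepth, unsigned DstDepth,
                            unsigned CommonDepth) {
  assert(CommonDepth <= std::min(SrcDepth, DstDepth));
  // One vector slot per distinct loop referenced by either instruction:
  // MaxLevels = SrcDepth + DstDepth - CommonDepth.
  return {CommonDepth, SrcDepth, SrcDepth + DstDepth - CommonDepth};
}

// For the fragment above, Src sits at depth 4 (a, b, c, d), Dst at
// depth 5 (a, b, e, f, g), and 2 loops are shared, so
// computeLevels(4, 5, 2) yields {2, 4, 7}, matching the a..g table.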
+  /// collectUpperBound - All subscripts are the same type (on my machine,
+  /// an i64). The loop bound may be a smaller type. collectUpperBound
+  /// finds the bound, if available, and zero extends it to the Type T.
+  /// (I zero extend since the bound should always be >= 0.)
+  /// If no upper bound is available, return NULL.
+  const SCEV *collectUpperBound(const Loop *l, Type *T) const;
+
+  /// collectConstantUpperBound - Calls collectUpperBound(), then
+  /// attempts to cast it to SCEVConstant. If the cast fails,
+  /// returns NULL.
+  const SCEVConstant *collectConstantUpperBound(const Loop *l, Type *T) const;
+
+  /// classifyPair - Examines the subscript pair (the Src and Dst SCEVs)
+  /// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
+  /// Collects the associated loops in a set.
+  Subscript::ClassificationKind
+  classifyPair(const SCEV *Src, const Loop *SrcLoopNest, const SCEV *Dst,
+               const Loop *DstLoopNest, SmallBitVector &Loops);
+
+  /// testZIV - Tests the ZIV subscript pair (Src and Dst) for dependence.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// If the dependence isn't proven to exist,
+  /// marks the Result as inconsistent.
+  bool testZIV(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const;
+
+  /// testSIV - Tests the SIV subscript pair (Src and Dst) for dependence.
+  /// Things of the form [c1 + a1*i] and [c2 + a2*j], where
+  /// i and j are induction variables, c1 and c2 are loop invariant,
+  /// and a1 and a2 are constant.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction vector entry and, when possible,
+  /// the distance vector entry.
+  /// If the dependence isn't proven to exist,
+  /// marks the Result as inconsistent.
+  bool testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
+               FullDependence &Result, Constraint &NewConstraint,
+               const SCEV *&SplitIter) const;
+
+  /// testRDIV - Tests the RDIV subscript pair (Src and Dst) for dependence.
+  /// Things of the form [c1 + a1*i] and [c2 + a2*j]
+  /// where i and j are induction variables, c1 and c2 are loop invariant,
+  /// and a1 and a2 are constant.
+  /// With minor algebra, this test can also be used for things like
+  /// [c1 + a1*i + a2*j][c2].
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Marks the Result as inconsistent.
+  bool testRDIV(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const;
+
+  /// testMIV - Tests the MIV subscript pair (Src and Dst) for dependence.
+  /// Returns true if dependence disproved.
+  /// Can sometimes refine direction vectors.
+  bool testMIV(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops,
+               FullDependence &Result) const;
+
+  /// strongSIVtest - Tests the strong SIV subscript pair (Src and Dst)
+  /// for dependence.
+  /// Things of the form [c1 + a*i] and [c2 + a*i],
+  /// where i is an induction variable, c1 and c2 are loop invariant,
+  /// and a is a constant.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction and distance.
+  bool strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
+                     const SCEV *DstConst, const Loop *CurrentLoop,
+                     unsigned Level, FullDependence &Result,
+                     Constraint &NewConstraint) const;
+
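The constant-coefficient arithmetic behind the strong SIV test just declared is easy to lose in the SCEV plumbing, so here is a rough constants-only sketch (hypothetical helper; the real test works on SCEVs and also handles symbolic deltas and direction entries):

#include <cassert>
#include <cstdlib>
#include <optional>

// For subscripts c1 + a*i (Src) and c2 + a*i (Dst), a dependence needs
// a*i1 + c1 == a*i2 + c2, i.e. a constant distance d = i2 - i1 =
// (c1 - c2) / a that is an exact integer no larger than the trip count.
std::optional<long long> strongSIVDistance(long long C1, long long C2,
                                           long long A, long long TripCount) {
  assert(A != 0 && "a zero coefficient is handled by other SIV/ZIV tests");
  long long Delta = C1 - C2;
  if (Delta % A != 0)
    return std::nullopt; // no integer solution: independent
  long long D = Delta / A;
  if (std::llabs(D) > TripCount)
    return std::nullopt; // distance exceeds the trip count: independent
  return D;              // dependence with constant distance D
}

// Example: for (i = 0; i < N; ++i) A[i + 2] = A[i]; gives c1 = 2, c2 = 0,
// a = 1, hence a dependence of distance 2.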
+  /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair
+  /// (Src and Dst) for dependence.
+  /// Things of the form [c1 + a*i] and [c2 - a*i],
+  /// where i is an induction variable, c1 and c2 are loop invariant,
+  /// and a is a constant.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction entry.
+  /// Set consistent to false.
+  /// Marks the dependence as splitable.
+  bool weakCrossingSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst,
+                           const SCEV *DstConst, const Loop *CurrentLoop,
+                           unsigned Level, FullDependence &Result,
+                           Constraint &NewConstraint,
+                           const SCEV *&SplitIter) const;
+
+  /// exactSIVtest - Tests the SIV subscript pair
+  /// (Src and Dst) for dependence.
+  /// Things of the form [c1 + a1*i] and [c2 + a2*i],
+  /// where i is an induction variable, c1 and c2 are loop invariant,
+  /// and a1 and a2 are constant.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction entry.
+  /// Set consistent to false.
+  bool exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+                    const SCEV *SrcConst, const SCEV *DstConst,
+                    const Loop *CurrentLoop, unsigned Level,
+                    FullDependence &Result, Constraint &NewConstraint) const;
+
+  /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair
+  /// (Src and Dst) for dependence.
+  /// Things of the form [c1] and [c2 + a*i],
+  /// where i is an induction variable, c1 and c2 are loop invariant,
+  /// and a is a constant. See also weakZeroDstSIVtest.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction entry.
+  /// Set consistent to false.
+  /// If loop peeling will break the dependence, mark appropriately.
+  bool weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEV *SrcConst,
+                          const SCEV *DstConst, const Loop *CurrentLoop,
+                          unsigned Level, FullDependence &Result,
+                          Constraint &NewConstraint) const;
+
+  /// weakZeroDstSIVtest - Tests the weak-zero SIV subscript pair
+  /// (Src and Dst) for dependence.
+  /// Things of the form [c1 + a*i] and [c2],
+  /// where i is an induction variable, c1 and c2 are loop invariant,
+  /// and a is a constant. See also weakZeroSrcSIVtest.
+  /// Returns true if any possible dependence is disproved.
+  /// If there might be a dependence, returns false.
+  /// Sets appropriate direction entry.
+  /// Set consistent to false.
+  /// If loop peeling will break the dependence, mark appropriately.
+  bool weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst,
+                          const SCEV *DstConst, const Loop *CurrentLoop,
+                          unsigned Level, FullDependence &Result,
+                          Constraint &NewConstraint) const;
+
+  /// exactRDIVtest - Tests the RDIV subscript pair for dependence.
+  /// Things of the form [c1 + a*i] and [c2 + b*j],
+  /// where i and j are induction variables, c1 and c2 are loop invariant,
+  /// and a and b are constants.
+  /// Returns true if any possible dependence is disproved.
+  /// Marks the result as inconsistent.
+  /// Works in some cases that symbolicRDIVtest doesn't,
+  /// and vice versa.
+  bool exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+                     const SCEV *SrcConst, const SCEV *DstConst,
+                     const Loop *SrcLoop, const Loop *DstLoop,
+                     FullDependence &Result) const;
+
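The exact SIV/RDIV tests declared above both start from the same number-theoretic fact: a1*i - a2*j = c2 - c1 has integer solutions iff gcd(a1, a2) divides c2 - c1; extended Euclid then produces a particular solution, and the solution family is intersected with the loop bounds. A minimal sketch of the solvability check (hypothetical helper, constants only):

#include <numeric>

// a1*i - a2*j == C is solvable in integers iff gcd(a1, a2) divides C.
// The real tests go further: they materialize a solution via extended
// Euclid and then check it against the iteration ranges of both loops.
bool mayHaveIntegerSolution(long long A1, long long A2, long long C) {
  long long G = std::gcd(A1, A2); // gcd of |A1| and |A2|
  return G != 0 && C % G == 0;
}

// Example: [c1 + 2*i] vs. [c2 + 4*j] with c2 - c1 == 3 is independent,
// since gcd(2, 4) == 2 does not divide 3.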
+  /// symbolicRDIVtest - Tests the RDIV subscript pair for dependence.
+  /// Things of the form [c1 + a*i] and [c2 + b*j],
+  /// where i and j are induction variables, c1 and c2 are loop invariant,
+  /// and a and b are constants.
+  /// Returns true if any possible dependence is disproved.
+  /// Marks the result as inconsistent.
+  /// Works in some cases that exactRDIVtest doesn't,
+  /// and vice versa. Can also be used as a backup for
+  /// ordinary SIV tests.
+  bool symbolicRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+                        const SCEV *SrcConst, const SCEV *DstConst,
+                        const Loop *SrcLoop, const Loop *DstLoop) const;
+
+  /// gcdMIVtest - Tests an MIV subscript pair for dependence.
+  /// Returns true if any possible dependence is disproved.
+  /// Marks the result as inconsistent.
+  /// Can sometimes disprove the equal direction for 1 or more loops.
+  /// Can handle some symbolics that even the SIV tests don't get,
+  /// so we use it as a backup for everything.
+  bool gcdMIVtest(const SCEV *Src, const SCEV *Dst,
+                  FullDependence &Result) const;
+
+  /// banerjeeMIVtest - Tests an MIV subscript pair for dependence.
+  /// Returns true if any possible dependence is disproved.
+  /// Marks the result as inconsistent.
+  /// Computes directions.
+  bool banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
+                       const SmallBitVector &Loops,
+                       FullDependence &Result) const;
-  /// createDependenceAnalysisPass - This creates an instance of the
-  /// DependenceAnalysis wrapper pass.
-  LLVM_ABI FunctionPass *createDependenceAnalysisWrapperPass();
+  /// collectCoeffInfo - Walks through the subscript, collecting each
+  /// coefficient, the associated loop bounds, and recording its positive and
+  /// negative parts for later use.
+  CoefficientInfo *collectCoeffInfo(const SCEV *Subscript, bool SrcFlag,
+                                    const SCEV *&Constant) const;
+
+  /// getPositivePart - X^+ = max(X, 0).
+  const SCEV *getPositivePart(const SCEV *X) const;
+
+  /// getNegativePart - X^- = min(X, 0).
+  const SCEV *getNegativePart(const SCEV *X) const;
+
+  /// getLowerBound - Looks through all the bounds info and
+  /// computes the lower bound given the current direction settings
+  /// at each level.
+  const SCEV *getLowerBound(BoundInfo *Bound) const;
+
+  /// getUpperBound - Looks through all the bounds info and
+  /// computes the upper bound given the current direction settings
+  /// at each level.
+  const SCEV *getUpperBound(BoundInfo *Bound) const;
+
+  /// exploreDirections - Hierarchically expands the direction vector
+  /// search space, combining the directions of discovered dependences
+  /// in the DirSet field of Bound. Returns the number of distinct
+  /// dependences discovered. If the dependence is disproved,
+  /// it will return 0.
+  unsigned exploreDirections(unsigned Level, CoefficientInfo *A,
+                             CoefficientInfo *B, BoundInfo *Bound,
+                             const SmallBitVector &Loops,
+                             unsigned &DepthExpanded, const SCEV *Delta) const;
+
+  /// testBounds - Returns true iff the current bounds are plausible.
+  bool testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound,
+                  const SCEV *Delta) const;
+
+  /// findBoundsALL - Computes the upper and lower bounds for level K
+  /// using the * direction. Records them in Bound.
+  void findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound,
+                     unsigned K) const;
+
+  /// findBoundsLT - Computes the upper and lower bounds for level K
+  /// using the < direction. Records them in Bound.
+  void findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound,
+                    unsigned K) const;
+
+  /// findBoundsGT - Computes the upper and lower bounds for level K
+  /// using the > direction. Records them in Bound.
+  void findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound,
+                    unsigned K) const;
+
+  /// findBoundsEQ - Computes the upper and lower bounds for level K
+  /// using the = direction. Records them in Bound.
+ void findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, + unsigned K) const; + + /// intersectConstraints - Updates X with the intersection + /// of the Constraints X and Y. Returns true if X has changed. + bool intersectConstraints(Constraint *X, const Constraint *Y); + + /// propagate - Review the constraints, looking for opportunities + /// to simplify a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagate(const SCEV *&Src, const SCEV *&Dst, SmallBitVector &Loops, + SmallVectorImpl<Constraint> &Constraints, bool &Consistent); + + /// propagateDistance - Attempt to propagate a distance + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagateDistance(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint, bool &Consistent); + + /// propagatePoint - Attempt to propagate a point + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + bool propagatePoint(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint); + + /// propagateLine - Attempt to propagate a line + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagateLine(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint, bool &Consistent); + + /// findCoefficient - Given a linear SCEV, + /// return the coefficient corresponding to specified loop. + /// If there isn't one, return the SCEV constant 0. + /// For example, given a*i + b*j + c*k, returning the coefficient + /// corresponding to the j loop would yield b. + const SCEV *findCoefficient(const SCEV *Expr, const Loop *TargetLoop) const; + + /// zeroCoefficient - Given a linear SCEV, + /// return the SCEV given by zeroing out the coefficient + /// corresponding to the specified loop. + /// For example, given a*i + b*j + c*k, zeroing the coefficient + /// corresponding to the j loop would yield a*i + c*k. + const SCEV *zeroCoefficient(const SCEV *Expr, const Loop *TargetLoop) const; + + /// addToCoefficient - Given a linear SCEV Expr, + /// return the SCEV given by adding some Value to the + /// coefficient corresponding to the specified TargetLoop. + /// For example, given a*i + b*j + c*k, adding 1 to the coefficient + /// corresponding to the j loop would yield a*i + (b+1)*j + c*k. + const SCEV *addToCoefficient(const SCEV *Expr, const Loop *TargetLoop, + const SCEV *Value) const; + + /// updateDirection - Update direction vector entry + /// based on the current constraint. + void updateDirection(Dependence::DVEntry &Level, + const Constraint &CurConstraint) const; + + /// Given a linear access function, tries to recover subscripts + /// for each dimension of the array element access. + bool tryDelinearize(Instruction *Src, Instruction *Dst, + SmallVectorImpl<Subscript> &Pair); + + /// Tries to delinearize \p Src and \p Dst access functions for a fixed size + /// multi-dimensional array. 
Calls tryDelinearizeFixedSizeImpl() to
+  /// delinearize \p Src and \p Dst separately.
+  bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst,
+                               const SCEV *SrcAccessFn, const SCEV *DstAccessFn,
+                               SmallVectorImpl<const SCEV *> &SrcSubscripts,
+                               SmallVectorImpl<const SCEV *> &DstSubscripts);
+
+  /// Tries to delinearize the access functions for a multi-dimensional array
+  /// with symbolic runtime sizes.
+  /// Returns true upon success and false otherwise.
+  bool
+  tryDelinearizeParametricSize(Instruction *Src, Instruction *Dst,
+                               const SCEV *SrcAccessFn, const SCEV *DstAccessFn,
+                               SmallVectorImpl<const SCEV *> &SrcSubscripts,
+                               SmallVectorImpl<const SCEV *> &DstSubscripts);
+
+  /// checkSubscript - Helper function for checkSrcSubscript and
+  /// checkDstSubscript to avoid duplicate code.
+  bool checkSubscript(const SCEV *Expr, const Loop *LoopNest,
+                      SmallBitVector &Loops, bool IsSrc);
+}; // class DependenceInfo
+
+/// AnalysisPass to compute dependence information in a function
+class DependenceAnalysis : public AnalysisInfoMixin<DependenceAnalysis> {
+public:
+  typedef DependenceInfo Result;
+  LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM);
+
+private:
+  LLVM_ABI static AnalysisKey Key;
+  friend struct AnalysisInfoMixin<DependenceAnalysis>;
+}; // class DependenceAnalysis
+
+/// Printer pass to dump DA results.
+struct DependenceAnalysisPrinterPass
+    : public PassInfoMixin<DependenceAnalysisPrinterPass> {
+  DependenceAnalysisPrinterPass(raw_ostream &OS, bool NormalizeResults = false)
+      : OS(OS), NormalizeResults(NormalizeResults) {}
+
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+
+  static bool isRequired() { return true; }
+
+private:
+  raw_ostream &OS;
+  bool NormalizeResults;
+}; // class DependenceAnalysisPrinterPass
+
+/// Legacy pass manager pass to access dependence information
+class LLVM_ABI DependenceAnalysisWrapperPass : public FunctionPass {
+public:
+  static char ID; // Class identification, replacement for typeinfo
+  DependenceAnalysisWrapperPass();
+
+  bool runOnFunction(Function &F) override;
+  void releaseMemory() override;
+  void getAnalysisUsage(AnalysisUsage &) const override;
+  void print(raw_ostream &, const Module * = nullptr) const override;
+  DependenceInfo &getDI() const;
+
+private:
+  std::unique_ptr<DependenceInfo> info;
+}; // class DependenceAnalysisWrapperPass
+
+/// createDependenceAnalysisWrapperPass - This creates an instance of the
+/// DependenceAnalysis wrapper pass.
+LLVM_ABI FunctionPass *createDependenceAnalysisWrapperPass();
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 6f2ad33..3275e32 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1260,6 +1260,9 @@ public:
   /// stack arguments from being clobbered.
   LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain);
+  std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl,
+                                        SDValue Dst, SDValue Src, SDValue Size,
+                                        const CallInst *CI);
   /* \p CI if not null is the memset call being lowered.
    * \p OverrideTailCall is an optional parameter that can be used to override
    * the tail call optimization decision.
*/ diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h index 463f0ec..fd00f81 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -23,6 +23,7 @@ namespace llvm { +class CallInst; class SelectionDAG; //===----------------------------------------------------------------------===// @@ -118,8 +119,7 @@ public: virtual std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, SDValue Op3, - MachinePointerInfo Op1PtrInfo, - MachinePointerInfo Op2PtrInfo) const { + const CallInst *CI) const { return std::make_pair(SDValue(), SDValue()); } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 206ad4a..b681ea8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -3286,7 +3286,8 @@ private: emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, - bool IsXBinopExpr); + bool IsXBinopExpr, bool IsIgnoreDenormalMode, + bool IsFineGrainedMemory, bool IsRemoteMemory); /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 . /// @@ -3359,7 +3360,9 @@ public: LLVM_ABI InsertPointOrErrorTy createAtomicUpdate( const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr); + AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, + bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false, + bool IsRemoteMemory = false); /// Emit atomic update for constructs: --- Only Scalar data types /// V = X; X = X BinOp Expr , @@ -3394,7 +3397,9 @@ public: const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr); + bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, + bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false, + bool IsRemoteMemory = false); /// Emit atomic compare for constructs: --- Only scalar data types /// cond-expr-stmt: diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 967d166..1bcc442 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -137,6 +137,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr +def llvm_constant_ptr_ty: LLVMQualPointerType<4>; // (const)ptr def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr @@ -2212,15 +2213,17 @@ def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d // Intrinsics for Prefetch and Prefetchu let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in { foreach level = ["L1", "L2"] in { - def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>; - def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>; - def int_nvvm_prefetch_local_ # level : Intrinsic<[], 
[llvm_local_ptr_ty]>;
+    def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
+    def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
   }
+  def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
+
   foreach eviction_priority = ["evict_normal", "evict_last"] in
-    def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
-  def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
+  def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
 }
 // applypriority
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index df472d4..eadf3ea 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -276,6 +276,7 @@ foreach FPTy = ["F32", "F64", "F128", "PPCF128"] in {
 }
 // Memory
+def MEMCMP : RuntimeLibcall;
 def MEMCPY : RuntimeLibcall;
 def MEMMOVE : RuntimeLibcall;
 def MEMSET : RuntimeLibcall;
@@ -1990,12 +1991,14 @@ defset list<RuntimeLibcallImpl> PPCRuntimeLibcalls = {
 }
 defset list<RuntimeLibcallImpl> PPC64AIXCallList = {
+  def ___memcmp64 : RuntimeLibcallImpl<MEMCMP>;
   def ___memmove64 : RuntimeLibcallImpl<MEMCPY>;
   def ___memset64 : RuntimeLibcallImpl<MEMSET>;
   def ___bzero64 : RuntimeLibcallImpl<BZERO>;
 }
 defset list<RuntimeLibcallImpl> PPC32AIXCallList = {
+  def ___memcmp : RuntimeLibcallImpl<MEMCMP>;
   def ___memmove : RuntimeLibcallImpl<MEMMOVE>;
   def ___memset : RuntimeLibcallImpl<MEMSET>;
   def ___bzero : RuntimeLibcallImpl<BZERO>;
diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h
index a331295..a94e578 100644
--- a/llvm/include/llvm/Support/DebugLog.h
+++ b/llvm/include/llvm/Support/DebugLog.h
@@ -56,6 +56,16 @@ namespace llvm {
   DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), LEVEL, DEBUG_TYPE)
 #define LDBG_LOG_LEVEL_1() LDBG_LOG_LEVEL(1)
+// We want the file name without the full path. We use the __FILE__ macro
+// and a constexpr function to strip the path prefix. We can avoid repeated
+// frontend evaluation of __FILE__ by using __FILE_NAME__ when it is defined
+// (GCC and Clang define it), since it already contains just the file name.
+#if defined(__FILE_NAME__)
+#define __LLVM_FILE_NAME__ __FILE_NAME__
+#else
+#define __LLVM_FILE_NAME__ ::llvm::impl::getShortFileName(__FILE__)
+#endif
+
 #define DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, \
                                                 LINE) \
   for (bool _c = \
@@ -69,17 +79,8 @@ namespace llvm {
 #define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, FILE) \
   DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, __LINE__)
-// When __SHORT_FILE__ is not defined, the File is the full path,
-// otherwise __SHORT_FILE__ is defined in CMake to provide the file name
-// without the path prefix.
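The fallback branch above relies on llvm::impl::getShortFileName to strip the directory prefix at compile time. A simplified constexpr sketch of that idea (illustrative only, not the actual implementation):

// Return the last path component of a string literal such as __FILE__.
constexpr const char *shortFileName(const char *Path) {
  const char *Short = Path;
  for (const char *P = Path; *P != '\0'; ++P)
    if (*P == '/' || *P == '\\')
      Short = P + 1; // restart after every separator
  return Short;
}

static_assert(shortFileName("llvm/Support/DebugLog.h")[0] == 'D',
              "the directory prefix is stripped at compile time");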
-#if defined(__SHORT_FILE__) #define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __SHORT_FILE__) -#else -#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, \ - ::llvm::impl::getShortFileName(__FILE__)) -#endif + DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __LLVM_FILE_NAME__) namespace impl { diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h index 39a4c0b..af2e501 100644 --- a/llvm/include/llvm/Support/GraphWriter.h +++ b/llvm/include/llvm/Support/GraphWriter.h @@ -61,8 +61,7 @@ enum Name { LLVM_ABI bool DisplayGraph(StringRef Filename, bool wait = true, GraphProgram::Name program = GraphProgram::DOT); -template<typename GraphType> -class GraphWriter { +template <typename GraphType, typename Derived> class GraphWriterBase { raw_ostream &O; const GraphType &G; bool RenderUsingHTML = false; @@ -75,9 +74,15 @@ class GraphWriter { DOTTraits DTraits; static_assert(std::is_pointer_v<NodeRef>, - "FIXME: Currently GraphWriter requires the NodeRef type to be " - "a pointer.\nThe pointer usage should be moved to " - "DOTGraphTraits, and removed from GraphWriter itself."); + "FIXME: Currently GraphWriterBase requires the NodeRef type to " + "be a pointer.\nThe pointer usage should be moved to " + "DOTGraphTraits, and removed from GraphWriterBase itself."); + + // Cast the 'this' pointer to the derived type and return a reference. + Derived &getDerived() { return *static_cast<Derived *>(this); } + const Derived &getDerived() const { + return *static_cast<const Derived *>(this); + } // Writes the edge labels of the node to O and returns true if there are any // edge labels not equal to the empty string "". @@ -118,23 +123,24 @@ class GraphWriter { } public: - GraphWriter(raw_ostream &o, const GraphType &g, bool SN) : O(o), G(g) { + GraphWriterBase(raw_ostream &o, const GraphType &g, bool SN) : O(o), G(g) { DTraits = DOTTraits(SN); RenderUsingHTML = DTraits.renderNodesUsingHTML(); } + virtual ~GraphWriterBase() {} void writeGraph(const std::string &Title = "") { // Output the header for the graph... - writeHeader(Title); + getDerived().writeHeader(Title); // Emit all of the nodes in the graph... - writeNodes(); + getDerived().writeNodes(); // Output any customizations on the graph - DOTGraphTraits<GraphType>::addCustomGraphFeatures(G, *this); + DOTGraphTraits<GraphType>::addCustomGraphFeatures(G, getDerived()); // Output the end of the graph - writeFooter(); + getDerived().writeFooter(); } void writeHeader(const std::string &Title) { @@ -166,8 +172,8 @@ public: void writeNodes() { // Loop over the graph, printing it out... 
for (const auto Node : nodes<GraphType>(G)) - if (!isNodeHidden(Node)) - writeNode(Node); + if (!getDerived().isNodeHidden(Node)) + getDerived().writeNode(Node); } bool isNodeHidden(NodeRef Node) { return DTraits.isNodeHidden(Node, G); } @@ -302,9 +308,9 @@ public: if (DTraits.getEdgeSourceLabel(Node, EI).empty()) edgeidx = -1; - emitEdge(static_cast<const void*>(Node), edgeidx, - static_cast<const void*>(TargetNode), DestPort, - DTraits.getEdgeAttributes(Node, EI, G)); + getDerived().emitEdge(static_cast<const void *>(Node), edgeidx, + static_cast<const void *>(TargetNode), DestPort, + DTraits.getEdgeAttributes(Node, EI, G)); } } @@ -357,10 +363,17 @@ public: } }; -template<typename GraphType> +template <typename GraphType> +class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> { +public: + GraphWriter(raw_ostream &o, const GraphType &g, bool SN) + : GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {} + ~GraphWriter() override {} +}; + +template <typename GraphType> raw_ostream &WriteGraph(raw_ostream &O, const GraphType &G, - bool ShortNames = false, - const Twine &Title = "") { + bool ShortNames = false, const Twine &Title = "") { // Start the graph emission process... GraphWriter<GraphType> W(O, G, ShortNames); diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f1473b2..256befa 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { - for (inst_iterator DstI = SrcI, DstE = inst_end(F); - DstI != DstE; ++DstI) { + for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE; + ++DstI) { if (DstI->mayReadOrWriteMemory()) { OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n"; OS << " da analyze - "; @@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) - OS << "normalized - "; + OS << "normalized - "; D->dump(OS); for (unsigned Level = 1; Level <= D->getLevels(); Level++) { if (D->isSplitable(Level)) { @@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { - dumpExampleDependence(OS, info.get(), - getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + dumpExampleDependence( + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); } PreservedAnalyses @@ -249,33 +249,26 @@ bool Dependence::isInput() const { return Src->mayReadFromMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an output dependence. bool Dependence::isOutput() const { return Src->mayWriteToMemory() && Dst->mayWriteToMemory(); } - // Returns true if this is an flow (aka true) dependence. bool Dependence::isFlow() const { return Src->mayWriteToMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an anti dependence. bool Dependence::isAnti() const { return Src->mayReadFromMemory() && Dst->mayWriteToMemory(); } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. 
// Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level) const { - return false; -} - +bool Dependence::isScalar(unsigned level) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) { DV[Level - 1].Direction = RevDirection; // Reverse the dependence distance as well. if (DV[Level - 1].Distance != nullptr) - DV[Level - 1].Distance = - SE->getNegativeSCEV(DV[Level - 1].Distance); + DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance); } LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n"; @@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const { return DV[Level - 1].Direction; } - // Returns the distance (or NULL) associated with a particular level. const SCEV *FullDependence::getDistance(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Distance; } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. @@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const { return DV[Level - 1].Scalar; } - // Returns true if peeling the first iteration from this loop // will break this dependence. bool FullDependence::isPeelFirst(unsigned Level) const { @@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const { return DV[Level - 1].PeelFirst; } - // Returns true if peeling the last iteration from this loop // will break this dependence. bool FullDependence::isPeelLast(unsigned Level) const { @@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const { return DV[Level - 1].PeelLast; } - // Returns true if splitting this loop will break the dependence. bool FullDependence::isSplitable(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Splitable; } - //===----------------------------------------------------------------------===// // DependenceInfo::Constraint methods @@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const { return A; } - // If constraint is a point <X, Y>, returns Y. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getY() const { @@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const { return B; } - // If constraint is a line AX + BY = C, returns A. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getA() const { @@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const { return A; } - // If constraint is a line AX + BY = C, returns B. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getB() const { @@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const { return B; } - // If constraint is a line AX + BY = C, returns C. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getC() const { @@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const { return C; } - // If constraint is a distance, returns D. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getD() const { @@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const { return SE->getNegativeSCEV(C); } - // Returns the loop associated with this constraint. 
const Loop *DependenceInfo::Constraint::getAssociatedLoop() const { assert((Kind == Distance || Kind == Line || Kind == Point) && @@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const { else if (isPoint()) OS << " Point is <" << *getX() << ", " << *getY() << ">\n"; else if (isDistance()) - OS << " Distance is " << *getD() << - " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n"; + OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB() + << "*Y = " << *getC() << ")\n"; else if (isLine()) - OS << " Line is " << *getA() << "*X + " << - *getB() << "*Y = " << *getC() << "\n"; + OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC() + << "\n"; else llvm_unreachable("unknown constraint type in Constraint::dump"); } #endif - // Updates X with the intersection // of the Constraints X and Y. Returns true if X has changed. // Corresponds to Figure 4 from the paper @@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB()); const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB()); const SCEVConstant *C1A2_C2A1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); const SCEVConstant *C1B2_C2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); const SCEVConstant *A1B2_A2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); const SCEVConstant *A2B1_A1B2 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); - if (!C1B2_C2B1 || !C1A2_C2A1 || - !A1B2_A2B1 || !A2B1_A1B2) + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); + if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; APInt Xtop = C1B2_C2B1->getAPInt(); APInt Xbot = A1B2_A2B1->getAPInt(); @@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { ++DeltaSuccesses; return true; } - if (const SCEVConstant *CUB = - collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { + if (const SCEVConstant *CUB = collectConstantUpperBound( + X->getAssociatedLoop(), Prod1->getType())) { const APInt &UpperBound = CUB->getAPInt(); LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { @@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return true; } } - X->setPoint(SE->getConstant(Xq), - SE->getConstant(Yq), + X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq), X->getAssociatedLoop()); ++DeltaSuccesses; return true; @@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return false; } - //===----------------------------------------------------------------------===// // DependenceInfo methods @@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const { // tbaa, non-overlapping regions etc), then it is known there is no dependecy. // Otherwise the underlying objects are checked to see if they point to // different identifiable objects. 
-static AliasResult underlyingObjectsAlias(AAResults *AA, - const DataLayout &DL, +static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL, const MemoryLocation &LocA, const MemoryLocation &LocB) { // Check the original locations (minus size) for noalias, which can happen for @@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA, // Returns true if the load or store can be analyzed. Atomic and volatile // operations have properties which this analysis does not understand. -static -bool isLoadOrStore(const Instruction *I) { +static bool isLoadOrStore(const Instruction *I) { if (const LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->isUnordered(); else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) { return false; } - // Examines the loop nesting of the Src and Dst // instructions and establishes their shared loops. Sets the variables // CommonLevels, SrcLevels, and MaxLevels. @@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src, MaxLevels -= CommonLevels; } - // Given one of the loops containing the source, return // its level index in our numbering scheme. unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const { return SrcLoop->getLoopDepth(); } - // Given one of the loops containing the destination, // return its level index in our numbering scheme. unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { @@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { return D; } - // Returns true if Expression is loop invariant in LoopNest. bool DependenceInfo::isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const { @@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression, return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop()); } - - // Finds the set of loops from the LoopNest that // have a level <= CommonLevels and are referred to by the SCEV Expression. void DependenceInfo::collectCommonLoops(const SCEV *Expression, @@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() > widestWidthSeen) { @@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { } } - assert(widestWidthSeen > 0); // Now extend each pair to the widest seen. 
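Both hunks around this point reformat unifySubscriptType, which first records the widest integer type over all subscript pairs and then widens every narrower subscript to it; sign extension is used because getelementptr treats array subscripts as signed. A minimal sketch of the widening step (illustrative, on llvm::APInt constants rather than SCEVs):

#include "llvm/ADT/APInt.h"

// Widen V to WidestWidth bits; sign extension preserves the signed
// subscript value, matching getelementptr semantics.
llvm::APInt widenSubscript(const llvm::APInt &V, unsigned WidestWidth) {
  return V.getBitWidth() < WidestWidth ? V.sext(WidestWidth) : V;
}

// Example: widenSubscript(llvm::APInt(32, -1, /*isSigned=*/true), 64)
// yields the 64-bit value -1 (all ones), not 0x00000000FFFFFFFF.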
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() < widestWidthSeen) @@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest, return checkSubscript(Dst, LoopNest, Loops, false); } - // Examines the subscript pair (the Src and Dst SCEVs) // and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. // Collects the associated loops in a set. @@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, return Subscript::ZIV; if (N == 1) return Subscript::SIV; - if (N == 2 && (SrcLoops.count() == 0 || - DstLoops.count() == 0 || + if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 || (SrcLoops.count() == 1 && DstLoops.count() == 1))) return Subscript::RDIV; return Subscript::MIV; } - // A wrapper around SCEV::isKnownPredicate. // Looks for cases where we're interested in comparing for equality. // If both X and Y have been identically sign or zero extended, @@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, // involving symbolics. bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const { - if (Pred == CmpInst::ICMP_EQ || - Pred == CmpInst::ICMP_NE) { - if ((isa<SCEVSignExtendExpr>(X) && - isa<SCEVSignExtendExpr>(Y)) || - (isa<SCEVZeroExtendExpr>(X) && - isa<SCEVZeroExtendExpr>(Y))) { + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) || + (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) { const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X); const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y); const SCEV *Xop = CX->getOperand(); @@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, } } -/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1)) +/// Compare to see if S is less than Size, using +/// +/// isKnownNegative(S - max(Size, 1)) +/// /// with some extra checking if S is an AddRec and we can prove less-than using /// the loop bounds. bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { @@ -1178,7 +1142,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const { return nullptr; } - // Calls collectUpperBound(), then attempts to cast it to SCEVConstant. // If the cast fails, returns NULL. 
const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, @@ -1188,7 +1151,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, return nullptr; } - // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1218,7 +1180,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst, return false; // possibly dependent } - // strongSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.1 // @@ -1270,9 +1231,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound); LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); const SCEV *AbsDelta = - SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); + SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); const SCEV *AbsCoeff = - SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); + SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { // Distance greater than trip count - no dependence @@ -1286,7 +1247,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); - APInt Distance = ConstDelta; // these need to be initialized + APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n"); @@ -1307,29 +1268,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, else Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else if (Delta->isZero()) { + } else if (Delta->isZero()) { // since 0/X == 0 Result.DV[Level].Distance = Delta; NewConstraint.setDistance(Delta, CurLoop); Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else { + } else { if (Coeff->isOne()) { LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n"); Result.DV[Level].Distance = Delta; // since X/1 == X NewConstraint.setDistance(Delta, CurLoop); - } - else { + } else { Result.Consistent = false; - NewConstraint.setLine(Coeff, - SE->getNegativeSCEV(Coeff), + NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff), SE->getNegativeSCEV(Delta), CurLoop); } // maybe we can get a useful direction - bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); + bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta); bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta); bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff); @@ -1353,7 +1310,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, return false; } - // weakCrossingSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1447,8 +1403,8 @@ bool DependenceInfo::weakCrossingSIVtest( if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) { LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n"); const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2); - const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), - ConstantTwo); + const SCEV *ML = 
+ SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo); LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n"); if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) { // Delta too big, no dependence @@ -1498,7 +1454,6 @@ bool DependenceInfo::weakCrossingSIVtest( return false; } - // Kirch's algorithm, from // // Optimizing Supercompilers for Supercomputers @@ -1519,9 +1474,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM, APInt R = G0; APInt::sdivrem(G0, G1, Q, R); while (R != 0) { + // clang-format off APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2; APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2; G0 = G1; G1 = R; + // clang-format on APInt::sdivrem(G0, G1, Q, R); } G = G1; @@ -1543,8 +1500,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q; else return Q - 1; @@ -1556,8 +1512,7 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q + 1; else return Q; @@ -1733,17 +1688,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return Result.DV[Level].Direction == Dependence::DVEntry::NONE; } - // Return true if the divisor evenly divides the dividend. -static -bool isRemainderZero(const SCEVConstant *Dividend, - const SCEVConstant *Divisor) { +static bool isRemainderZero(const SCEVConstant *Dividend, + const SCEVConstant *Divisor) { const APInt &ConstDividend = Dividend->getAPInt(); const APInt &ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } - // weakZeroSrcSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1807,11 +1759,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1853,7 +1805,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, return false; } - // weakZeroDstSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1916,11 +1867,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? 
SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1962,7 +1913,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, return false; } - // exactRDIVtest - Tests the RDIV subscript pair for dependence. // Things of the form [c1 + a*i] and [c2 + b*j], // where i and j are induction variable, c1 and c2 are loop invariant, @@ -2084,7 +2034,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return TL.sgt(TU); } - // symbolicRDIVtest - // In Section 4.5 of the Practical Dependence Testing paper,the authors // introduce a special case of Banerjee's Inequalities (also called the @@ -2167,8 +2116,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 >= 0 && a2 <= 0 if (N1 && N2) { // make sure that c2 - c1 <= a1*N1 - a2*N2 @@ -2187,8 +2135,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A1)) { + } else if (SE->isKnownNonPositive(A1)) { if (SE->isKnownNonNegative(A2)) { // a1 <= 0 && a2 >= 0 if (N1 && N2) { @@ -2207,8 +2154,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, ++SymbolicRDIVindependence; return true; } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 <= 0 && a2 <= 0 if (N1) { // make sure that a1*N1 <= c2 - c1 @@ -2233,7 +2179,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return false; } - // testSIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i] // where i is an induction variable, c1 and c2 are loop invariant, and a1 and @@ -2260,17 +2205,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, Level = mapSrcLoop(CurLoop); bool disproven; if (SrcCoeff == DstCoeff) - disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint); + disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint); else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff)) disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint, SplitIter); else disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint); - return disproven || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop); + return disproven || gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, + CurLoop); } if (SrcAddRec) { const SCEV *SrcConst = SrcAddRec->getStart(); @@ -2278,9 +2223,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *DstConst = Dst; const Loop *CurLoop = SrcAddRec->getLoop(); Level = mapSrcLoop(CurLoop); - return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } if (DstAddRec) { const SCEV *DstConst = DstAddRec->getStart(); @@ -2288,15 +2233,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *SrcConst = Src; const Loop *CurLoop = DstAddRec->getLoop(); Level = mapDstLoop(CurLoop); - return 
weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, - CurLoop, Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } llvm_unreachable("SIV test expected at least one AddRec"); return false; } - // testRDIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j] // where i and j are induction variables, c1 and c2 are loop invariant, @@ -2333,46 +2277,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst, DstConst = DstAddRec->getStart(); DstCoeff = DstAddRec->getStepRecurrence(*SE); DstLoop = DstAddRec->getLoop(); - } - else if (SrcAddRec) { + } else if (SrcAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { SrcConst = tmpAddRec->getStart(); SrcCoeff = tmpAddRec->getStepRecurrence(*SE); SrcLoop = tmpAddRec->getLoop(); DstConst = Dst; DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE)); DstLoop = SrcAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else if (DstAddRec) { + } else if (DstAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { DstConst = tmpAddRec->getStart(); DstCoeff = tmpAddRec->getStepRecurrence(*SE); DstLoop = tmpAddRec->getLoop(); SrcConst = Src; SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE)); SrcLoop = DstAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else + } else llvm_unreachable("RDIV expected at least one AddRec"); - return exactRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop, + return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop, Result) || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop); + gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, + DstLoop); } - // Tests the single-subscript MIV pair (Src and Dst) for dependence. // Return true if dependence disproved. // Can sometimes refine direction vectors. @@ -2383,7 +2318,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n"); Result.Consistent = false; return gcdMIVtest(Src, Dst, Result) || - banerjeeMIVtest(Src, Dst, Loops, Result); + banerjeeMIVtest(Src, Dst, Loops, Result); } // Given a product, e.g., 10*X*Y, returns the first constant operand, @@ -2428,7 +2363,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. const SCEV *Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2446,7 +2381,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. 
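
For reference, the arithmetic that gcdMIVtest accumulates here fits in a few lines: gather the gcd of every loop coefficient on both sides, then check whether it divides the constant difference. A standalone sketch with plain integers standing in for SCEVs (function and variable names below are illustrative, not LLVM API):

#include <cstdint>
#include <numeric> // std::gcd
#include <vector>

// Subscripts c1 + sum(a_i * x_i) and c2 + sum(b_j * y_j) can only meet at
// integer points if gcd of all coefficients divides c2 - c1.
// Returns true when that fails, i.e. the dependence is disproved.
bool gcdDisprovesDependence(const std::vector<int64_t> &SrcCoeffs,
                            const std::vector<int64_t> &DstCoeffs,
                            int64_t C1, int64_t C2) {
  int64_t G = 0; // gcd(0, x) == |x|, so zero seeds the reduction
  for (int64_t A : SrcCoeffs)
    G = std::gcd(G, A);
  for (int64_t B : DstCoeffs)
    G = std::gcd(G, B);
  int64_t Delta = C2 - C1;
  return G != 0 && Delta % G != 0; // true => no dependence
}

As the comments in the real code note, a gcd of 1 divides everything, so the test only ever disproves a dependence; it never confirms one.
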
Coefficients = Dst; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2468,16 +2403,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, if (isa<SCEVConstant>(Operand)) { assert(!Constant && "Surprised to find multiple constants"); Constant = cast<SCEVConstant>(Operand); - } - else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { + } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. std::optional<APInt> ConstOp = getConstantPart(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); - } - else + } else return false; } } @@ -2512,7 +2445,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool Improved = false; Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { Coefficients = AddRec->getStart(); const Loop *CurLoop = AddRec->getLoop(); RunningGCD = ExtraGCD; @@ -2578,7 +2511,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, return false; } - //===----------------------------------------------------------------------===// // banerjeeMIVtest - // Use Banerjee's Inequalities to test an MIV subscript pair. @@ -2652,8 +2584,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) { // Explore the direction vector hierarchy. unsigned DepthExpanded = 0; - unsigned NewDeps = exploreDirections(1, A, B, Bound, - Loops, DepthExpanded, Delta); + unsigned NewDeps = + exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta); if (NewDeps > 0) { bool Improved = false; for (unsigned K = 1; K <= CommonLevels; ++K) { @@ -2670,23 +2602,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, } if (Improved) ++BanerjeeSuccesses; - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - delete [] Bound; - delete [] A; - delete [] B; + delete[] Bound; + delete[] A; + delete[] B; return Disproved; } - // Hierarchically expands the direction vector // search space, combining the directions of discovered dependences // in the DirSet field of Bound. Returns the number of distinct @@ -2788,27 +2717,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A, // test bounds for <, *, *, ... if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // Test bounds for =, *, *, ... if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // test bounds for >, *, *, ... 
if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); Bound[Level].Direction = Dependence::DVEntry::ALL; return NewDeps; - } - else - return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta); + } else + return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); } - // Returns true iff the current bounds are plausible. bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, const SCEV *Delta) const { @@ -2822,7 +2750,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, return true; } - // Computes the upper and lower bounds for level K // using the * direction. Records them in Bound. // Wolfe gives the equations @@ -2840,17 +2767,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, // and the upper bound is always >= 0. void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::ALL] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::ALL] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), - Bound[K].Iterations); - Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), - Bound[K].Iterations); - } - else { + Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations); + Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations); + } else { // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = @@ -2861,7 +2787,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the = direction. Records them in Bound. // Wolfe gives the equations @@ -2879,18 +2804,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, // and the upper bound is always >= 0. void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::EQ] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::EQ] = + nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); const SCEV *NegativePart = getNegativePart(Delta); Bound[K].Lower[Dependence::DVEntry::EQ] = - SE->getMulExpr(NegativePart, Bound[K].Iterations); + SE->getMulExpr(NegativePart, Bound[K].Iterations); const SCEV *PositivePart = getPositivePart(Delta); Bound[K].Upper[Dependence::DVEntry::EQ] = - SE->getMulExpr(PositivePart, Bound[K].Iterations); - } - else { + SE->getMulExpr(PositivePart, Bound[K].Iterations); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); @@ -2903,7 +2829,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the < direction. Records them in Bound. // Wolfe gives the equations @@ -2919,35 +2844,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::LT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::LT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); + SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); Bound[K].Upper[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); - } - else { + SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); } } - // Computes the upper and lower bounds for level K // using the > direction. Records them in Bound. // Wolfe gives the equations @@ -2963,45 +2888,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. 
+ Bound[K].Lower[Dependence::DVEntry::GT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::GT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); + SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); Bound[K].Upper[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); - } - else { + SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. - const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + const SCEV *NegPart = + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff; - const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + const SCEV *PosPart = + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff; } } - // X^+ = max(X, 0) const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const { return SE->getSMaxExpr(X, SE->getZero(X->getType())); } - // X^- = min(X, 0) const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const { return SE->getSMinExpr(X, SE->getZero(X->getType())); } - // Walks through the subscript, // collecting each coefficient, the associated loop bounds, // and recording its positive and negative parts for later use. @@ -3046,7 +2971,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, return CI; } - // Looks through all the bounds info and // computes the lower bound given the current direction settings // at each level. If the lower bound for any level is -inf, @@ -3062,7 +2986,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const { return Sum; } - // Looks through all the bounds info and // computes the upper bound given the current direction settings // at each level. If the upper bound at any level is +inf, @@ -3078,7 +3001,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const { return Sum; } - //===----------------------------------------------------------------------===// // Constraint manipulation for Delta test. @@ -3098,7 +3020,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr, return findCoefficient(AddRec->getStart(), TargetLoop); } - // Given a linear SCEV, // return the SCEV given by zeroing out the coefficient // corresponding to the specified loop. @@ -3112,12 +3033,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr, if (AddRec->getLoop() == TargetLoop) return AddRec->getStart(); return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop), - AddRec->getStepRecurrence(*SE), - AddRec->getLoop(), + AddRec->getStepRecurrence(*SE), AddRec->getLoop(), AddRec->getNoWrapFlags()); } - // Given a linear SCEV Expr, // return the SCEV given by adding some Value to the // coefficient corresponding to the specified TargetLoop. 
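
The bound computations above are direct transcriptions of Wolfe's equations. A toy single-level version of the '*' case, assuming a known trip count U and with int64_t standing in for SCEVs (helper names are mine):

#include <algorithm>
#include <cstdint>

static int64_t posPart(int64_t X) { return std::max<int64_t>(X, 0); } // X^+
static int64_t negPart(int64_t X) { return std::min<int64_t>(X, 0); } // X^-

// For subscripts a*i and b*i' with 0 <= i, i' <= U, a dependence under the
// '*' direction requires
//   (a^- - b^+)*U  <=  delta  <=  (a^+ - b^-)*U,
// where delta is the difference of the constant terms.
bool starDirectionPossible(int64_t A, int64_t B, int64_t U, int64_t Delta) {
  int64_t Lower = (negPart(A) - posPart(B)) * U;
  int64_t Upper = (posPart(A) - negPart(B)) * U;
  return Lower <= Delta && Delta <= Upper;
}

The real code uses nullptr bounds to stand for -infinity/+infinity when the trip count is unknown; the toy omits that case.
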
@@ -3128,17 +3047,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, const SCEV *Value) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) // create a new addRec - return SE->getAddRecExpr(Expr, - Value, - TargetLoop, + return SE->getAddRecExpr(Expr, Value, TargetLoop, SCEV::FlagAnyWrap); // Worst case, with no info. if (AddRec->getLoop() == TargetLoop) { const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value); if (Sum->isZero()) return AddRec->getStart(); - return SE->getAddRecExpr(AddRec->getStart(), - Sum, - AddRec->getLoop(), + return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(), AddRec->getNoWrapFlags()); } if (SE->isLoopInvariant(AddRec, TargetLoop)) @@ -3149,7 +3064,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, AddRec->getNoWrapFlags()); } - // Review the constraints, looking for opportunities // to simplify a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3178,7 +3092,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst, return Result; } - // Attempt to propagate a distance // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3204,7 +3117,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a line // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3224,22 +3136,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, if (A->isZero()) { const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Bconst || !Cconst) return false; + if (!Bconst || !Cconst) + return false; APInt Beta = Bconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); - // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Dst = zeroCoefficient(Dst, CurLoop); if (!findCoefficient(Src, CurLoop)->isZero()) Consistent = false; - } - else if (B->isZero()) { + } else if (B->isZero()) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3249,11 +3161,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Src = zeroCoefficient(Src, CurLoop); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { + } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3264,8 +3176,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Dst = addToCoefficient(Dst, CurLoop, A_K); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else { + } else { // paper is incorrect here, or perhaps just misleading const SCEV *A_K = 
findCoefficient(Src, CurLoop); Src = SE->getMulExpr(Src, A); @@ -3281,7 +3192,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a point // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3302,7 +3212,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst, return true; } - // Update direction vector entry based on the current constraint. void DependenceInfo::updateDirection(Dependence::DVEntry &Level, const Constraint &CurConstraint) const { @@ -3322,34 +3231,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level, if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else if (CurConstraint.isLine()) { + } else if (CurConstraint.isLine()) { Level.Scalar = false; Level.Distance = nullptr; // direction should be accurate - } - else if (CurConstraint.isPoint()) { + } else if (CurConstraint.isPoint()) { Level.Scalar = false; Level.Distance = nullptr; unsigned NewDirection = Dependence::DVEntry::NONE; - if (!isKnownPredicate(CmpInst::ICMP_NE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(), CurConstraint.getX())) // if X may be = Y NewDirection |= Dependence::DVEntry::EQ; - if (!isKnownPredicate(CmpInst::ICMP_SLE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be > X NewDirection |= Dependence::DVEntry::LT; - if (!isKnownPredicate(CmpInst::ICMP_SGE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be < X NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else + } else llvm_unreachable("constraint has unexpected kind"); } @@ -3425,7 +3328,7 @@ bool DependenceInfo::tryDelinearizeFixedSize( dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); assert(SrcBase && DstBase && SrcBase == DstBase && "expected src and dst scev unknowns to be equal"); - }); + }); SmallVector<int, 4> SrcSizes; SmallVector<int, 4> DstSizes; @@ -3737,9 +3640,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); LLVM_DEBUG(dbgs() << " subscript " << P << "\n"); @@ -3814,18 +3716,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later ++NonlinearSubscriptPairs; - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; } else if (Pair[SI].Classification == Subscript::ZIV) { // always separable Separable.set(SI); - } - else { + } else { // SIV, RDIV, or MIV, so check for coupled group bool Done = true; for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) { @@ -3843,8 +3742,7 @@ 
DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Group.count() == 1) { Separable.set(SI); ++SeparableSubscriptPairs; - } - else { + } else { Coupled.set(SI); ++CoupledSubscriptPairs; } @@ -3950,10 +3848,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Constraints, Result.Consistent)) { LLVM_DEBUG(dbgs() << "\t Changed\n"); ++DeltaPropagations; - Pair[SJ].Classification = - classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()), - Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()), - Pair[SJ].Loops); + Pair[SJ].Classification = classifyPair( + Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst, + LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops); switch (Pair[SJ].Classification) { case Subscript::ZIV: LLVM_DEBUG(dbgs() << "ZIV\n"); @@ -3995,8 +3892,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, LLVM_DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) return nullptr; - } - else + } else llvm_unreachable("expected only MIV subscripts at this point"); } @@ -4052,8 +3948,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, break; } } - } - else { + } else { // On the other hand, if all directions are equal and there's no // loop-independent dependence possible, then no dependence exists. bool AllEqual = true; @@ -4158,9 +4053,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); } @@ -4172,15 +4066,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, for (unsigned SI = 0; SI < Pairs; ++SI) { if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; - } - else if (Pair[SI].Classification == Subscript::ZIV) + } else if (Pair[SI].Classification == Subscript::ZIV) Separable.set(SI); else { // SIV, RDIV, or MIV, so check for coupled group @@ -4214,8 +4105,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, case Subscript::SIV: { unsigned Level; const SCEV *SplitIter = nullptr; - (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level, - Result, NewConstraint, SplitIter); + (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, + SplitIter); if (Level == SplitLevel) { assert(SplitIter != nullptr); return SplitIter; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index b3b4c37..425ea31 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: 
case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index bfa72bf..b9e72c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8889,6 +8889,44 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, } } +std::pair<SDValue, SDValue> +SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, + SDValue Mem1, SDValue Size, const CallInst *CI) { + const char *LibCallName = TLI->getLibcallName(RTLIB::MEMCMP); + if (!LibCallName) + return {}; + + // Emit a library call. + auto GetEntry = [](Type *Ty, SDValue &SDV) { + TargetLowering::ArgListEntry E; + E.Ty = Ty; + E.Node = SDV; + return E; + }; + + PointerType *PT = PointerType::getUnqual(*getContext()); + TargetLowering::ArgListTy Args = { + GetEntry(PT, Mem0), GetEntry(PT, Mem1), + GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)}; + + TargetLowering::CallLoweringInfo CLI(*this); + bool IsTailCall = false; + bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); + IsTailCall = CI && CI->isTailCall() && + isInTailCallPosition(*CI, getTarget(), ReturnsFirstArg); + + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee( + TLI->getLibcallCallingConv(RTLIB::MEMCMP), + Type::getInt32Ty(*getContext()), + getExternalSymbol(LibCallName, TLI->getPointerTy(getDataLayout())), + std::move(Args)) + .setTailCall(IsTailCall); + + return TLI->LowerCallTo(CLI); +} + SDValue SelectionDAG::getMemcpy( SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 868e2f4..f5f5c14 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9090,7 +9090,7 @@ bool SelectionDAGBuilder::visitMemCmpBCmpCall(const CallInst &I) { const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), - getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); + getValue(Size), &I); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 17a01f48..bf4c9f9 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1008,7 +1008,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements( CR = CR.subtract(APInt(64, 1)); unsigned EltWidth = RetTy->getScalarSizeInBits(); - EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits()); + EltWidth = 
std::min(EltWidth, CR.getActiveBits()); EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8); return EltWidth; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index 987e639..a201fae 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -60,6 +60,20 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const { ", filled slots:", SymbolTableOffset, (uint64_t)SymbolTable.size()) << '\n'; + + const auto FindCuVectorId = [&](uint32_t VecOffset) { + // Entries in ConstantPoolVectors are sorted by their offset in constant + // pool, see how ConstantPoolVectors is populated in parseImpl. + const auto *It = + llvm::lower_bound(ConstantPoolVectors, VecOffset, + [](const auto &ConstantPoolEntry, uint32_t Offset) { + return ConstantPoolEntry.first < Offset; + }); + assert(It != ConstantPoolVectors.end() && It->first == VecOffset && + "Invalid symbol table"); + return It - ConstantPoolVectors.begin(); + }; + uint32_t I = -1; for (const SymTableEntry &E : SymbolTable) { ++I; @@ -72,13 +86,7 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const { StringRef Name = ConstantPoolStrings.substr( ConstantPoolOffset - StringPoolOffset + E.NameOffset); - auto CuVector = llvm::find_if( - ConstantPoolVectors, - [&](const std::pair<uint32_t, SmallVector<uint32_t, 0>> &V) { - return V.first == E.VecOffset; - }); - assert(CuVector != ConstantPoolVectors.end() && "Invalid symbol table"); - uint32_t CuVectorId = CuVector - ConstantPoolVectors.begin(); + const uint32_t CuVectorId = FindCuVectorId(E.VecOffset); OS << format(" String name: %s, CU vector index: %d\n", Name.data(), CuVectorId); } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 3aa4f7a..260d3c2 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -8956,7 +8956,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate( const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { + AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous"); if (!updateToLocation(Loc)) return Loc.IP; @@ -8974,9 +8975,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate( "OpenMP atomic does not support LT or GT operations"); }); - Expected<std::pair<Value *, Value *>> AtomicResult = - emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, - X.IsVolatile, IsXBinopExpr); + Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate( + AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile, + IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory); if (!AtomicResult) return AtomicResult.takeError(); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); @@ -9023,7 +9024,8 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2, Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate( InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) { + 
AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { // TODO: handle the case where XElemTy is not byte-sized or not a power of 2 // or a complex datatype. bool emitRMWOp = false; @@ -9046,7 +9048,20 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate( std::pair<Value *, Value *> Res; if (emitRMWOp) { - Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO); + AtomicRMWInst *RMWInst = + Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO); + if (T.isAMDGPU()) { + if (IsIgnoreDenormalMode) + RMWInst->setMetadata("amdgpu.ignore.denormal.mode", + llvm::MDNode::get(Builder.getContext(), {})); + if (!IsFineGrainedMemory) + RMWInst->setMetadata("amdgpu.no.fine.grained.memory", + llvm::MDNode::get(Builder.getContext(), {})); + if (!IsRemoteMemory) + RMWInst->setMetadata("amdgpu.no.remote.memory", + llvm::MDNode::get(Builder.getContext(), {})); + } + Res.first = RMWInst; // not needed except in case of postfix captures. Generate anyway for // consistency with the else part. Will be removed with any DCE pass. // AtomicRMWInst::Xchg does not have a coressponding instruction. @@ -9178,7 +9193,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { + bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { if (!updateToLocation(Loc)) return Loc.IP; @@ -9197,9 +9213,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( // If UpdateExpr is 'x' updated with some `expr` not based on 'x', // 'x' is simply atomically rewritten with 'expr'. AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); - Expected<std::pair<Value *, Value *>> AtomicResult = - emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, - X.IsVolatile, IsXBinopExpr); + Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate( + AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile, + IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory); if (!AtomicResult) return AtomicResult.takeError(); Value *CapturedVal = diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index dc75878..b6a2f8a 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -385,7 +385,7 @@ size_t StringRef::count(StringRef Str) const { return Count; } -static unsigned GetAutoSenseRadix(StringRef &Str) { +unsigned llvm::getAutoSenseRadix(StringRef &Str) { if (Str.empty()) return 10; @@ -410,7 +410,7 @@ bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, unsigned long long &Result) { // Autosense radix if not specified. if (Radix == 0) - Radix = GetAutoSenseRadix(Str); + Radix = getAutoSenseRadix(Str); // Empty strings (after the radix autosense) are invalid. if (Str.empty()) return true; @@ -509,7 +509,7 @@ bool StringRef::consumeInteger(unsigned Radix, APInt &Result) { // Autosense radix if not specified. 
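
A usage sketch for the newly exported llvm::getAutoSenseRadix. The header carrying the declaration is not shown in this diff, so the include below is an assumption; the behavior shown (the helper consumes a recognized radix prefix from Str and returns the radix) follows the definition above:

#include "llvm/ADT/StringRef.h"
#include <cassert>

void autoSenseRadixExample() {
  llvm::StringRef Hex = "0x1f";
  assert(llvm::getAutoSenseRadix(Hex) == 16 && Hex == "1f");
  llvm::StringRef Bin = "0b101";
  assert(llvm::getAutoSenseRadix(Bin) == 2 && Bin == "101");
  llvm::StringRef Dec = "42";
  assert(llvm::getAutoSenseRadix(Dec) == 10 && Dec == "42");
}
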
if (Radix == 0) - Radix = GetAutoSenseRadix(Str); + Radix = getAutoSenseRadix(Str); assert(Radix > 1 && Radix <= 36); diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index 4ed5982..f5c4778 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -305,7 +305,7 @@ llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) { return (REG_INVARG); len = preg->re_endp - pattern; } else { - len = strlen((const char *)pattern); + len = strlen(pattern); } /* do the mallocs early so failure handling is easy */ diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a40de86b..3c06c6a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14742,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { return ResultSLI; } +static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64TargetLowering &TLI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + + if (!VT.isVector()) + return SDValue(); + + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // InstCombine does (not (neg a)) => (add a -1). + // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) + // Loop over all combinations of AND operands. + for (int i = 1; i >= 0; --i) { + for (int j = 1; j >= 0; --j) { + SDValue O0 = N0->getOperand(i); + SDValue O1 = N1->getOperand(j); + SDValue Sub, Add, SubSibling, AddSibling; + + // Find a SUB and an ADD operand, one from each AND. + if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { + Sub = O0; + Add = O1; + SubSibling = N0->getOperand(1 - i); + AddSibling = N1->getOperand(1 - j); + } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { + Add = O0; + Sub = O1; + AddSibling = N0->getOperand(1 - i); + SubSibling = N1->getOperand(1 - j); + } else + continue; + + if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) + continue; + + // Constant ones is always righthand operand of the Add. + if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) + continue; + + if (Sub.getOperand(1) != Add.getOperand(0)) + continue; + + return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); + } + } + + // (or (and a b) (and (not a) c)) => (bsl a b c) + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getScalarSizeInBits(); + uint64_t BitMask = Bits == 64 ? 
-1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + APInt Val1, Val2; + + if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && + ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && + (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), @@ -19419,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AArch64TargetLowering &TLI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); - - if (!VT.isVector()) - return SDValue(); - - if (VT.isScalableVector() && !Subtarget.hasSVE2()) - return SDValue(); - - if (VT.isFixedLengthVector() && - (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // InstCombine does (not (neg a)) => (add a -1). - // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) - // Loop over all combinations of AND operands. - for (int i = 1; i >= 0; --i) { - for (int j = 1; j >= 0; --j) { - SDValue O0 = N0->getOperand(i); - SDValue O1 = N1->getOperand(j); - SDValue Sub, Add, SubSibling, AddSibling; - - // Find a SUB and an ADD operand, one from each AND. - if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { - Sub = O0; - Add = O1; - SubSibling = N0->getOperand(1 - i); - AddSibling = N1->getOperand(1 - j); - } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { - Add = O0; - Sub = O1; - AddSibling = N0->getOperand(1 - i); - SubSibling = N1->getOperand(1 - j); - } else - continue; - - if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) - continue; - - // Constant ones is always righthand operand of the Add. - if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) - continue; - - if (Sub.getOperand(1) != Add.getOperand(0)) - continue; - - return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); - } - } - - // (or (and a b) (and (not a) c)) => (bsl a b c) - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getScalarSizeInBits(); - uint64_t BitMask = Bits == 64 ? 
-1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - APInt Val1, Val2; - - if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && - ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && - (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - if (FoundMatch) - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to // convert to csel(ccmp(.., cc0)), depending on cc1: diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index b97d622..fd4ef2a 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri -// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri +// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri +// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ADDXri + ADDXri @@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { // Strategy used to split logical immediate bitmasks. enum class SplitStrategy { Intersect, + Disjoint, }; template <typename T> bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, @@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); + assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!"); // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> +static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, + T &Imm2Enc) { + assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!"); + + // Try to split a bitmask of the form 0b00000000011000000000011110000000 into + // two disjoint masks such as 0b00000000011000000000000000000000 and + // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the + // new masks match the original mask. + unsigned LowestBitSet = llvm::countr_zero(Imm); + unsigned LowestGapBitUnset = + LowestBitSet + llvm::countr_one(Imm >> LowestBitSet); + + // Create a mask for the least significant group of consecutive ones. 
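
A standalone model of the Disjoint strategy on 64-bit immediates (names are mine). The precondition mirrors the asserts in the patch: the immediate is neither zero nor all-ones, and since a single run of ones already encodes as one logical immediate, a value reaching this split has at least two runs, so the lowest run ends below the top bit:

#include <bit>
#include <cassert>
#include <cstdint>

void splitDisjoint(uint64_t Imm, uint64_t &Lo, uint64_t &Hi) {
  unsigned LowestBit = std::countr_zero(Imm);
  unsigned RunEnd = LowestBit + std::countr_one(Imm >> LowestBit);
  assert(RunEnd < 64 && "lowest run of ones must stop below bit 63");
  Lo = (1ULL << RunEnd) - (1ULL << LowestBit); // the lowest run of ones
  Hi = Imm & ~Lo;                              // the remaining ones
  assert((Lo | Hi) == Imm && (Lo & Hi) == 0 && "halves must be disjoint");
}

For 0x60F0 this produces Lo = 0xF0 and Hi = 0x6000, matching the example in the comment; the caller still has to verify that both halves encode as logical immediates.
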
+ assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!"); + T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) - + (static_cast<T>(1) << LowestBitSet); + // Create a disjoint mask for the remaining ones. + T NewImm2 = Imm & ~NewImm1; + + // Do not split if NewImm2 is not a valid bitmask immediate. + if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) + return false; + + Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); + Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); + return true; +} + +template <typename T> bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, SplitStrategy Strategy, unsigned OtherOpc) { - // Try below transformation. + // Try below transformations. // - // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri - // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri + // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri + // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two - // bitmask immediates. It makes only two AND instructions instead of multiple - // mov + and instructions. + // bitmask immediates based on the given split strategy. It makes only two + // logical instructions instead of multiple mov + logic instructions. return splitTwoPartImm<T>( MI, @@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, case SplitStrategy::Intersect: SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); break; + case SplitStrategy::Disjoint: + SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; } if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); @@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= trySplitLogicalImm<uint64_t>( AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; + case AArch64::EORWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::EORXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI, + SplitStrategy::Disjoint); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index adc984a..1bc1d98 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ @@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", @@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ @@ -756,7 +759,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -766,7 +768,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd, FeatureFPAC]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index fb83388..9d6584a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3212,6 +3212,44 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, Src = Src.getOperand(0); } + if (Mods != SISrcMods::NONE) + return true; + + // Convert various sign-bit masks on integers to src mods. Currently disabled + // for 16-bit types as the codegen replaces the operand without adding a + // srcmod. 
This is intentionally finding the cases where we are performing
+ // float neg and abs on int types; the goal is not to obtain two's complement
+ // neg or abs. Limit conversion to select operands via the non-canonicalizing
+ // pattern.
+ // TODO: Add 16-bit support.
+ if (IsCanonicalizing)
+ return true;
+
+ unsigned Opc = Src->getOpcode();
+ EVT VT = Src.getValueType();
+ if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
+ (VT != MVT::i32 && VT != MVT::i64))
+ return true;
+
+ ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
+ if (!CRHS)
+ return true;
+
+ // Recognise (xor a, 0x80000000) as NEG SrcMod.
+ // Recognise (and a, 0x7fffffff) as ABS SrcMod.
+ // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
+ if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
+ Mods |= SISrcMods::NEG;
+ Src = Src.getOperand(0);
+ } else if (Opc == ISD::AND && AllowAbs &&
+ CRHS->getAPIntValue().isMaxSignedValue()) {
+ Mods |= SISrcMods::ABS;
+ Src = Src.getOperand(0);
+ } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
+ Mods |= SISrcMods::ABS | SISrcMods::NEG;
+ Src = Src.getOperand(0);
+ }
+
 return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0c653b1..962c276 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
 case AMDGPU::V_MAX_F16_fake16_e64:
 case AMDGPU::V_MAX_F64_e64:
 case AMDGPU::V_MAX_NUM_F64_e64:
- case AMDGPU::V_PK_MAX_F16: {
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ case AMDGPU::V_PK_MAX_NUM_BF16: {
 if (MI.mayRaiseFPException())
 return nullptr;
@@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
 // Having a 0 op_sel_hi would require swizzling the output in the source
 // instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
- : 0u;
+ unsigned UnsetMods =
+ (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
+ ? SISrcMods::OP_SEL_1
+ : 0u;
 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
 return nullptr;
 return Src0;
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a..25ad9ec 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 default: {
 // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
 // us to fold the constant into the cmp instruction.
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+ RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
 CC = ISD::SETGE;
 break;
 }
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
 // fold the constant into the cmp instruction.
 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ // An "icmp ugt i16 65535, %0" comparison should have been converted
+ // to something else already. Assert to make sure this assumption holds.
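
A scalar model of the hazard the assert below guards against, assuming i16 (names are mine): rewriting "C <u x" as "x >=u C+1" wraps when C is all-ones, turning an always-false comparison into an always-true one:

#include <cassert>
#include <cstdint>

bool constLessThanUnsigned(uint16_t C, uint16_t X) {
  // 0xFFFF <u x is always false, but 0xFFFF+1 wraps to 0 and x >=u 0 is
  // always true, so the all-ones case must be folded away beforehand.
  assert(C != UINT16_MAX && "must have been folded before this rewrite");
  return X >= static_cast<uint16_t>(C + 1); // same truth table as C <u X
}
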
+ assert((!C->isAllOnes()) && "integer overflow in comparison transform"); + RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT); CC = ISD::SETUGE; break; } diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index ffd900c..5153d24 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_usum: + case Intrinsic::dx_imad: + case Intrinsic::dx_umad: return true; default: return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 9003ace..d4f0cc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4046,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_prefetch_tensormap: { + auto &DL = I.getDataLayout(); + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = getPointerTy(DL); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; + Info.align.reset(); + return true; + } + case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d337192..d4a0ca7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -39,6 +39,12 @@ def AS_match { code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; + code const = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + }]; + code param = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + }]; } @@ -950,33 +956,47 @@ foreach dim = 3...5 in { defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", [hasTMACTAGroupSupport]>; -//Prefetch and Prefetchu - -let Predicates = [hasPTX<80>, hasSM<90>] in { - class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>; +//Prefetchu and Prefetch - def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; - def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; - def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; - def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; - def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; - def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; +defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr); - def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; +multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> { + def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>; +} - def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>; +defm prefetch_tensormap_ : 
PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>; - def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> { + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch" # addrspace_name # ".tensormap", + [(pattern_frag addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; } +defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>; +defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>; +defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>; + +class PREFETCH_INTRS<string InstName, Intrinsic Intr> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(Intr addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>; +def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>; +def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>; +def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; + //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d..f4f8961 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + } } return nullptr; } diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index 95de9f3..4039fed 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -22,3 +22,9 @@ bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE && Opcode <= PPCISD::LAST_STRICTFP_OPCODE; } + +std::pair<SDValue, SDValue> PPCSelectionDAGInfo::EmitTargetCodeForMemcmp( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, + SDValue Op3, const CallInst *CI) const { + return DAG.getMemcmp(Chain, dl, Op1, Op2, Op3, CI); +} diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h 
b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index 08e2ddb..1537851 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -20,6 +20,11 @@ public: bool isTargetMemoryOpcode(unsigned Opcode) const override; bool isTargetStrictFPOpcode(unsigned Opcode) const override; + + std::pair<SDValue, SDValue> + EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, + SDValue Op1, SDValue Op2, SDValue Op3, + const CallInst *CI) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 95ec42f..a997ea5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -471,6 +471,8 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, Count -= 1; } + // TODO: emit a mapping symbol right here + if (Count % 4 == 2) { // The canonical nop with Zca is c.nop. OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2); diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index bf23812..5541506 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,78 +13,113 @@ // //===----------------------------------------------------------------------===// -class SMX60IsWorstCaseMX<string mx, list<string> MxList> { - string LLMUL = LargestLMUL<MxList>.r; - bit c = !eq(mx, LLMUL); -} +//===----------------------------------------------------------------------===// +// Helpers + +// Maps an LMUL string to the corresponding value from the Values array +// LMUL values map to array indices as follows: +// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3], +// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6] +// Shorter lists are allowed, e.g., widening instructions don't work on M8 +class GetLMULValue<list<int> Values, string LMUL> { + defvar Index = !cond( + !eq(LMUL, "MF8"): 0, + !eq(LMUL, "MF4"): 1, + !eq(LMUL, "MF2"): 2, + !eq(LMUL, "M1"): 3, + !eq(LMUL, "M2"): 4, + !eq(LMUL, "M4"): 5, + !eq(LMUL, "M8"): 6, + ); -class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { - string LLMUL = LargestLMUL<MxList>.r; - int SSEW = SmallestSEW<mx, isF>.r; - bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); + assert !lt(Index, !size(Values)), + "Missing LMUL value for '" # LMUL # "'. " # + "Expected at least " # !add(Index, 1) # " elements, but got " # + !size(Values) # "."; + + int c = Values[Index]; } -defvar SMX60VLEN = 256; -defvar SMX60DLEN = !div(SMX60VLEN, 2); +// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL, +// then doubles Value for each subsequent LMUL +// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, currentLMUL> returns, per currentLMUL: +// MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32 +// This is useful for modeling scheduling parameters that scale with LMUL.
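Evaluated concretely, GetLMULValue is an indexed lookup and the doubling helper defined just below scales Value by two for every LMUL step past startLMUL, falling back to BaseValue below it. A host-side C++ sketch of that assumed arithmetic (names are illustrative, not from the patch):

    #include <array>
    #include <cassert>
    #include <string_view>

    // Look an LMUL name up by index: MF8..M8 map to 0..6.
    static int lmulIndex(std::string_view LMUL) {
      constexpr std::array<std::string_view, 7> Names = {"MF8", "MF4", "MF2",
                                                         "M1",  "M2",  "M4", "M8"};
      for (int I = 0; I < 7; ++I)
        if (Names[I] == LMUL)
          return I;
      assert(false && "unknown LMUL");
      return -1;
    }

    // BaseValue below StartLMUL; from StartLMUL on, double Value per step.
    static int constUntilLMULThenDoubleBase(std::string_view StartLMUL, int BaseValue,
                                            int Value, std::string_view CurrentLMUL) {
      int Diff = lmulIndex(CurrentLMUL) - lmulIndex(StartLMUL);
      return Diff < 0 ? BaseValue : Value << Diff;
    }

    int main() {
      // Matches the documented example <"M1", 2, 4, ...>: 2,2,2,4,8,16,32.
      assert(constUntilLMULThenDoubleBase("M1", 2, 4, "MF8") == 2);
      assert(constUntilLMULThenDoubleBase("M1", 2, 4, "M8") == 32);
      // Matches the VIDivLat use <"MF2", 12, ...>: 12 at MF2, 192 at M8.
      assert(constUntilLMULThenDoubleBase("MF2", 12, 12, "M8") == 192);
      return 0;
    }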
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> { + assert !le(BaseValue, Value), "BaseValue must be less-equal to Value"; + defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c; + defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c; -class Get1248Latency<string mx> { + // Calculate the difference in positions + defvar posDiff = !sub(currentPos, startPos); + + // Calculate Value * (2^posDiff) int c = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 4, - !eq(mx, "M8") : 8, - true: 1 + !eq(posDiff, 0) : Value, + !eq(posDiff, 1) : !mul(Value, 2), + !eq(posDiff, 2) : !mul(Value, 4), + !eq(posDiff, 3) : !mul(Value, 8), + !eq(posDiff, 4) : !mul(Value, 16), + !eq(posDiff, 5) : !mul(Value, 32), + !eq(posDiff, 6) : !mul(Value, 64), + true : BaseValue ); } -// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides -class Get4816Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 16, - true: 4 - ); +// Same as the previous function but BaseValue == Value +class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> { + int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c; +} + +// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32 +class ConstOneUntilMF4ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16 +class ConstOneUntilMF2ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8 +class ConstOneUntilM1ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c; } +//===----------------------------------------------------------------------===// +// Latency helper classes + // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max -class Get458Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 5, - !eq(mx, "M8") : 8, - true: 4 - ); +class Get4458Latency<string mx> { + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c; } -// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs -// Used for: widening operations +// Used for: widening operations (no M8) class Get4588Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 5, - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c; } // Used for: mask-producing comparisons, carry ops with mask, FP comparisons class Get461018Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 6, - !eq(mx, "M4") : 10, - !eq(mx, "M8") : 18, - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; } -// Used for: e64 multiply pattern, complex ops -class Get781632Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 8, - !eq(mx, "M4") : 16, - !eq(mx, "M8") : 32, - true: 7 - ); +//===----------------------------------------------------------------------===// + +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); } +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = 
!and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; } - let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VIALULat, ReleaseAtCycles = [4] in { // Pattern of vadd, vsub, vrsub: 4/4/5/8 // Pattern of vand, vor, vxor: 4/4/8/16 // They are grouped together, so we used the worst case 4/4/8/16 @@ -425,7 +461,7 @@ foreach mx = SchedMxList in { // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites - let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in { defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; @@ -461,15 +497,8 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - // Slightly reduced for fractional LMULs - defvar Multiplier = !cond( - !eq(mx, "MF8") : 12, - !eq(mx, "MF4") : 12, - !eq(mx, "MF2") : 12, - true: 24 - ); - - let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; + let Latency = VIDivLat, ReleaseAtCycles = [12] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; } @@ -480,14 +509,8 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - // Slightly increased for integer LMULs - defvar Multiplier = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 2, - true: 1 - ); - - let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + let Latency = VNarrowingLat, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index d76babe..afe838a 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -181,8 +181,7 @@ static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg, std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp( SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1, - SDValue Src2, SDValue Size, 
MachinePointerInfo Op1PtrInfo, - MachinePointerInfo Op2PtrInfo) const { + SDValue Src2, SDValue Size, const CallInst *CI) const { SDValue CCReg; // Swap operands to invert CC == 1 vs. CC == 2 cases. if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) { diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index c928f34..5a1e0cd 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -41,8 +41,7 @@ public: std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1, SDValue Src2, SDValue Size, - MachinePointerInfo Op1PtrInfo, - MachinePointerInfo Op2PtrInfo) const override; + const CallInst *CI) const override; std::pair<SDValue, SDValue> EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index b03b350..fc852d0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -136,6 +136,15 @@ static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL, if (VT == MVT::f64) { return wasm::ValType::F64; } + if (VT == MVT::externref) { + return wasm::ValType::EXTERNREF; + } + if (VT == MVT::funcref) { + return wasm::ValType::FUNCREF; + } + if (VT == MVT::exnref) { + return wasm::ValType::EXNREF; + } LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT << "\n"); llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func"); diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 46b5673..4edf25c 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -789,6 +789,13 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, bool NeedsMemMove = false; IRBuilder<> IRB(BB, IP); + auto GetAllocaSize = [&](AllocaInst *AI) { + return IRB.CreateMul( + IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), + ConstantInt::get(IntptrTy, + DL.getTypeAllocSize(AI->getAllocatedType()))); + }; + if (auto *A = dyn_cast<Argument>(V)) { assert(A->hasByValAttr() && "Type reset for non-byval argument?"); @@ -811,7 +818,11 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { - Size = II->getArgOperand(0); + auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + if (!AI) + return false; + + Size = GetAllocaSize(AI); Dest = II->getArgOperand(1); } else if (auto *AI = dyn_cast<AllocaInst>(I)) { // We need to clear the types for new stack allocations (or else we might @@ -820,10 +831,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I))); IRB.SetInstDebugLocation(I); - Size = IRB.CreateMul( - IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), - ConstantInt::get(IntptrTy, - DL.getTypeAllocSize(AI->getAllocatedType()))); + Size = GetAllocaSize(AI); Dest = I; } else { return false; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 938aab5..a7ba54f 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -582,15 +582,17 @@ struct AllSwitchPaths { 
VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB); + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); if (SwitchPhiDefBB == SwitchBlock) { TPaths = std::move(PathsToPhiDef); return; } + assert(MaxNumPaths >= PathsToPhiDef.size()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); if (PathsToSwitchBB.empty()) return; @@ -611,13 +613,16 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB) { + VisitedBlocks &VB, + unsigned PathsLimit) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { + if (Res.size() >= PathsLimit) + break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -653,8 +658,9 @@ private: // Direct predecessor, just add to the path. if (IncomingPhiDefBB == IncomingBB) { - std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + assert(PathsLimit > Res.size()); + std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( + StateDef, IncomingPhi, VB, PathsLimit - Res.size()); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -667,13 +673,17 @@ private: continue; PathsType IntermediatePaths; - IntermediatePaths = - paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); + assert(PathsLimit > Res.size()); + auto InterPathLimit = PathsLimit - Res.size(); + IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, + /* PathDepth = */ 1, InterPathLimit); if (IntermediatePaths.empty()) continue; + assert(InterPathLimit >= IntermediatePaths.size()); + auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -688,7 +698,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth) { + unsigned PathDepth, unsigned PathsLimit) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -715,6 +725,8 @@ private: // is used to prevent a duplicate path from being generated SmallSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { + if (Res.size() >= PathsLimit) + break; if (!Successors.insert(Succ).second) continue; @@ -736,14 +748,12 @@ private: // coverage and compile time. if (LI->getLoopFor(Succ) != CurrLoop) continue; - - PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); + assert(PathsLimit > Res.size()); + PathsType SuccPaths = + paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); - if (Res.size() >= MaxNumPaths) { - return Res; - } } } // This block could now be visited again from a different predecessor. 
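The DFAJumpThreading change above threads an explicit budget through the path search: each recursive enumeration stops once it has produced its share of MaxNumPaths, and where two path sets are combined as a cross product the remaining budget is divided between them. A simplified sketch of just the capped enumeration (toy graph type, hypothetical names):

    #include <cstddef>
    #include <vector>

    using Graph = std::vector<std::vector<int>>; // adjacency list, assumed acyclic
    using Path = std::vector<int>;

    // Collect at most Limit paths from Node to Target; every recursive call
    // checks the shared result count first, so Out never exceeds the budget.
    static void enumeratePaths(const Graph &G, int Node, int Target, size_t Limit,
                               Path &Prefix, std::vector<Path> &Out) {
      if (Out.size() >= Limit)
        return; // budget exhausted
      Prefix.push_back(Node);
      if (Node == Target)
        Out.push_back(Prefix);
      else
        for (int Succ : G[Node])
          enumeratePaths(G, Succ, Target, Limit, Prefix, Out);
      Prefix.pop_back();
    }

    int main() {
      Graph G = {{1, 2}, {3}, {3}, {}}; // diamond: 0 -> {1,2} -> 3
      Path Prefix;
      std::vector<Path> Out;
      enumeratePaths(G, 0, 3, /*Limit=*/1, Prefix, Out);
      return Out.size() == 1 ? 0 : 1; // budget kept us to one of the two paths
    }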
Note diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5d0e2f9..39011e7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3883,6 +3883,7 @@ private: enum CombinedOpcode { NotCombinedOp = -1, MinMax = Instruction::OtherOpsEnd + 1, + FMulAdd, }; CombinedOpcode CombinedOp = NotCombinedOp; @@ -4033,6 +4034,9 @@ private: /// Returns true if any scalar in the list is a copyable element. bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// Returns the state of the operations. + const InstructionsState &getOperations() const { return S; } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const { @@ -11987,6 +11991,81 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } } +static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, + const InstructionsState &S, + DominatorTree &DT, const DataLayout &DL, + TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI) { + assert(all_of(VL, + [](Value *V) { + return V->getType()->getScalarType()->isFloatingPointTy(); + }) && + "Can only convert to FMA for floating point types"); + assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub"); + + auto CheckForContractable = [&](ArrayRef<Value *> VL) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + // TODO: support for copyable elements. + Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I); + if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + } + return FMF.allowContract(); + }; + if (!CheckForContractable(VL)) + return InstructionCost::getInvalid(); + // fmul also should be contractable + InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL); + + InstructionsState OpS = getSameOpcode(Operands.front(), TLI); + if (!OpS.valid()) + return InstructionCost::getInvalid(); + if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul) + return InstructionCost::getInvalid(); + if (!CheckForContractable(Operands.front())) + return InstructionCost::getInvalid(); + // Compare the costs. + InstructionCost FMulPlusFAddCost = 0; + InstructionCost FMACost = 0; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + unsigned NumOps = 0; + for (auto [V, Op] : zip(VL, Operands.front())) { + auto *I = dyn_cast<Instruction>(Op); + if (!I || !I->hasOneUse()) { + FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind); + if (I) + FMACost += TTI.getInstructionCost(I, CostKind); + continue; + } + ++NumOps; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + Type *Ty = VL.front()->getType(); + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF); + FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind); + return FMACost < FMulPlusFAddCost ? 
FMACost : InstructionCost::getInvalid(); +} + void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; BaseGraphSize = VectorizableTree.size(); @@ -12355,6 +12434,25 @@ void BoUpSLP::transformNodes() { } break; } + case Instruction::FSub: + case Instruction::FAdd: { + // Check if possible to convert (a*b)+c to fma. + if (E.State != TreeEntry::Vectorize || + !E.getOperations().isAddSubLikeOp()) + break; + if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI) + .isValid()) + break; + // This node is a fmuladd node. + E.CombinedOp = TreeEntry::FMulAdd; + TreeEntry *FMulEntry = getOperandEntry(&E, 0); + if (FMulEntry->UserTreeIndex && + FMulEntry->State == TreeEntry::Vectorize) { + // The FMul node is part of the combined fmuladd node. + FMulEntry->State = TreeEntry::CombinedVectorize; + } + break; + } default: break; } @@ -13587,6 +13685,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return IntrinsicCost; }; + auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S, + Instruction *VI) { + InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI); + return Cost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. @@ -13927,6 +14030,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case TreeEntry::FMulAdd: { + auto GetScalarCost = [&](unsigned Idx) { + if (isa<PoisonValue>(UniqueValues[Idx])) + return InstructionCost(TTI::TCC_Free); + return GetFMulAddCost(E->getOperations(), + cast<Instruction>(UniqueValues[Idx])); + }; + auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : E->Scalars) { + if (auto *FPCI = dyn_cast<FPMathOperator>(V)) { + FMF &= FPCI->getFastMathFlags(); + if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0))) + FMF &= FPCIOp->getFastMathFlags(); + } + } + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy, + {VecTy, VecTy, VecTy}, FMF); + InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind); + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -13964,8 +14091,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1); TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2); - return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, - Op1Info, Op2Info, Operands); + InstructionCost ScalarCost = TTI->getArithmeticInstrCost( + ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands); + if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]); + I && (ShuffleOrOp == Instruction::FAdd || + ShuffleOrOp == Instruction::FSub)) { + InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; + } + return ScalarCost; }; auto GetVectorCost = [=](InstructionCost CommonCost) { if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { @@ -22594,11 +22729,21 @@ public: /// Try to find a reduction tree. 
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, + DominatorTree &DT, TargetTransformInfo &TTI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; + // FMA reduction root - skip. + auto CheckForFMA = [&](Instruction *I) { + return RdxKind == RecurKind::FAdd && + canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) + .isValid(); + }; + if (CheckForFMA(Root)) + return false; + // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22636,7 +22781,7 @@ public: // Also, do not try to reduce const values, if the operation is not // foldable. if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || + getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -24205,13 +24350,13 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { + auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; @@ -24277,6 +24422,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) return false; + // Skip potential FMA candidates. + if ((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && + canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI) + .isValid()) + return false; Value *P = I->getParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8052e31..73babcc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1054,12 +1054,17 @@ void VPlan::execute(VPTransformState *State) { InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { // For now only return the cost of the vector loop region, ignoring any other - // blocks, like the preheader or middle blocks. + // blocks, like the preheader or middle blocks, except for checking them for + // recipes with invalid costs. InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx); - // If any instructions in the middle block are invalid return invalid. - // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created. - if (!getMiddleBlock()->cost(VF, Ctx).isValid()) + // If the cost of the loop region is invalid, or any recipe in the skeleton + // outside loop regions is invalid, return an invalid cost.
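The new rule in VPlan::cost can be read as: return the loop-region cost, but report invalid when that cost, or any recipe in a skeleton block outside loop regions, is invalid. A small model of just that gating logic, with std::optional standing in for InstructionCost validity (illustrative only, simplified types):

    #include <algorithm>
    #include <cassert>
    #include <optional>
    #include <vector>

    // nullopt models InstructionCost::getInvalid(); a value models a valid cost.
    using Cost = std::optional<unsigned>;

    static Cost planCost(Cost LoopRegionCost, const std::vector<Cost> &SkeletonBlocks) {
      bool AnyInvalid = std::any_of(SkeletonBlocks.begin(), SkeletonBlocks.end(),
                                    [](const Cost &C) { return !C.has_value(); });
      if (!LoopRegionCost.has_value() || AnyInvalid)
        return std::nullopt;   // invalid overall
      return LoopRegionCost;   // skeleton costs only gate validity, they are not added
    }

    int main() {
      assert(planCost(Cost{10}, {Cost{1}, Cost{2}}).has_value());
      assert(!planCost(Cost{10}, {Cost{1}, std::nullopt}).has_value());
      return 0;
    }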
+ if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(getEntry())), + [&VF, &Ctx](VPBasicBlock *VPBB) { + return !VPBB->cost(VF, Ctx).isValid(); + })) return InstructionCost::getInvalid(); return Cost; diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll index 113eb14..4db9db9 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll @@ -370,3 +370,175 @@ entry: %r = select i1 %c, i64 %a, i64 %ands ret i64 %r } + +; Test EOR. +define i32 @test1_eor(i32 %a) { +; CHECK-LABEL: test1_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor w8, w0, #0x400 +; CHECK-NEXT: eor w0, w8, #0x200000 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 2098176 + ret i32 %eor +} + +; This constant should not be split because it can be handled by one mov. +define i32 @test2_eor(i32 %a) { +; CHECK-LABEL: test2_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: eor w0, w0, w8 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 135 + ret i32 %eor +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i32 @test3_eor(i32 %a) { +; CHECK-LABEL: test3_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: eor w0, w0, w8 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 2163712 + ret i32 %eor +} + +define i64 @test4_eor(i64 %a) { +; CHECK-LABEL: test4_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x0, #0x400 +; CHECK-NEXT: eor x0, x8, #0x200000 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 2098176 + ret i64 %eor +} + +define i64 @test5_eor(i64 %a) { +; CHECK-LABEL: test5_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x0, #0x4000 +; CHECK-NEXT: eor x0, x8, #0x200000000 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 8589950976 + ret i64 %eor +} + +; This constant should not be split because it can be handled by one mov. +define i64 @test6_eor(i64 %a) { +; CHECK-LABEL: test6_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: eor x0, x0, x8 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 135 + ret i64 %eor +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i64 @test7_eor(i64 %a) { +; CHECK-LABEL: test7_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: eor x0, x0, x8 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 2163712 + ret i64 %eor +} + +; Test ORR. +define i32 @test1_orr(i32 %a) { +; CHECK-LABEL: test1_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr w8, w0, #0x400 +; CHECK-NEXT: orr w0, w8, #0x200000 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 2098176 + ret i32 %orr +} + +; This constant should not be split because it can be handled by one mov. +define i32 @test2_orr(i32 %a) { +; CHECK-LABEL: test2_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: orr w0, w0, w8 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 135 + ret i32 %orr +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. 
+define i32 @test3_orr(i32 %a) { +; CHECK-LABEL: test3_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: orr w0, w0, w8 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 2163712 + ret i32 %orr +} + +define i64 @test4_orr(i64 %a) { +; CHECK-LABEL: test4_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x0, #0x400 +; CHECK-NEXT: orr x0, x8, #0x200000 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 2098176 + ret i64 %orr +} + +define i64 @test5_orr(i64 %a) { +; CHECK-LABEL: test5_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x0, #0x4000 +; CHECK-NEXT: orr x0, x8, #0x200000000 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 8589950976 + ret i64 %orr +} + +; This constant should not be split because it can be handled by one mov. +define i64 @test6_orr(i64 %a) { +; CHECK-LABEL: test6_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: orr x0, x0, x8 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 135 + ret i64 %orr +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i64 @test7_orr(i64 %a) { +; CHECK-LABEL: test7_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: orr x0, x0, x8 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 2163712 + ret i64 %orr +} diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 9979e83..682b3b4 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,10 +368,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e32 v0, v0 -; GCN-NEXT: v_nop -; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) @@ -382,9 +379,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) { ; GCN-LABEL: test_clamp_v2bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp ; GCN-NEXT: ; return to shader part epilog %mul = fmul <2 x bfloat> %src0, %src1 %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 1b092b2..5674ae3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc ; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i32 %arg0, -2147483648 %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0 @@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i64 %arg0, 9223372036854775808 %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll new file mode 100644 index 0000000..005c8c8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + +; Demonstrate that the conversion of bitmasks affecting the sign bit on integers to srcmods +; does not apply to canonicalizing instructions. + +define double @v_uitofp_i32_to_f64_abs(i32 %arg0) nounwind { +; GCN-LABEL: v_uitofp_i32_to_f64_abs: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i32_to_f64_abs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %arg0.abs = and i32 %arg0, u0x7fffffff + %cvt = uitofp i32 %arg0.abs to double + ret double %cvt +} + +define double @v_uitofp_i32_to_f64_neg(i32 %arg0) nounwind { +; GCN-LABEL: v_uitofp_i32_to_f64_neg: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i32_to_f64_neg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %arg0.neg = and i32 %arg0, u0x80000000 + %cvt = uitofp i32 %arg0.neg to double + ret double %cvt +} + +define double @s_uitofp_i32_to_f64_abs(i32 inreg %arg0) nounwind { +; GCN-LABEL: s_uitofp_i32_to_f64_abs: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_bitset0_b32 s16, 31 +; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_uitofp_i32_to_f64_abs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s0, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %arg0.abs = and i32 %arg0, u0x7fffffff + %cvt = uitofp i32 %arg0.abs to double + ret double %cvt +} + +define double @s_uitofp_i32_to_f64_neg(i32 inreg %arg0) nounwind { +; GCN-LABEL: s_uitofp_i32_to_f64_neg: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_and_b32 s4, s16, 0x80000000 +; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_uitofp_i32_to_f64_neg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %arg0.neg = and i32 %arg0, u0x80000000 + %cvt = uitofp i32 %arg0.neg to double + ret double %cvt +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} +; GFX7: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll new file mode 100644 index 0000000..b3c7ac8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll @@ -0,0 +1,1011 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + +define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_both: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_both: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %neg.b = xor i32 %b, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %neg.b + ret i32 %select +} + +define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_1_fabs_2_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc_lo 
+; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %abs.b = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %abs.b + ret i32 %select +} + +define i32 @s_fneg_select_i32_1(i32 inreg %cond, i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: s_fneg_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s4, s17, 0x80000000 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s1, s1, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @s_fneg_1_fabs_2_select_i32(i32 inreg %cond, i32 %a, i32 %b) { +; GCN-LABEL: s_fneg_1_fabs_2_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_1_fabs_2_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %abs.b = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %abs.b + ret i32 %select +} + +define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc_lo +; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fneg_1_fabs_2_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_1_fabs_2_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %abs.b = and <2 x i32> %a, splat (i32 u0x7fffffff) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %abs.b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_fabs_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + + +define <2 x i32> @s_fneg_select_v2i32_1(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) { +; GCN-LABEL: s_fneg_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GCN-NEXT: s_xor_b32 s5, s18, 0x80000000 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s5, s5, s20 +; GCN-NEXT: s_cmp_eq_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s21 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] 
+ %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) { +; GCN-LABEL: s_fneg_fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_bitset1_b32 s19, 31 +; GCN-NEXT: s_bitset1_b32 s18, 31 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s4, s20, s18 +; GCN-NEXT: s_cmp_eq_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s5, s21, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_1_fabs_2_select_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: 
v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %abs.b = and i64 %b, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %abs.b + ret i64 %select +} + +define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select 
+} + +define i64 @s_fneg_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s6, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s18, s20 +; GFX9-NEXT: s_cselect_b32 s4, s4, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fneg_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s20, s18 +; GFX9-NEXT: s_cselect_b32 s4, s21, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @s_fneg_1_fabs_2_select_i64(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_1_fabs_2_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_bitset0_b32 s21, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s6, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
s_fneg_1_fabs_2_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_bitset0_b32 s21, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s18, s20 +; GFX9-NEXT: s_cselect_b32 s4, s4, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_1_fabs_2_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_bitset0_b32 s17, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %abs.b = and i64 %b, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %abs.b + ret i64 %select +} + +define i64 @s_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fabs_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset0_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s19, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fabs_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset0_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s18, s20 +; GFX9-NEXT: s_cselect_b32 s5, s19, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset0_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset0_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s20, s18 +; GFX9-NEXT: s_cselect_b32 s5, s21, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 
s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @s_fneg_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_fabs_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset1_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s19, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_fabs_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s18, s20 +; GFX9-NEXT: s_cselect_b32 s5, s19, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fneg_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset1_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s20, s18 +; GFX9-NEXT: s_cselect_b32 s5, s21, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i16 @fneg_select_i16_1(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %b + ret i16 %select +} + +define i16 @fneg_select_i16_2(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_2: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_2: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %b, i16 %neg.a + ret i16 %select +} + +define i16 @fneg_select_i16_both(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_both: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_both: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_both: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_both: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %neg.b = xor i16 %b, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %neg.b + ret i16 %select +} + +define i16 @fneg_1_fabs_2_select_i16(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_1_fabs_2_select_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_1_fabs_2_select_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %abs.b = and i16 %a, u0x7fff + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %abs.b + ret i16 %select +} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll index 1b2eb83..4393172 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll @@ -74,10 +74,11 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -191,10 +192,11 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> @@ -247,12 +249,12 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo ; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3] ; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 clamp ; GFX1250-NEXT: s_set_pc_i64 s[30:31] + + + %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> %src2.ext = fpext <4 x bfloat> %src2 to <4 x float> diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 019eb2c..4995ce6 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -124,9 +124,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i32: @@ -136,9 +135,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: 
v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i32: @@ -442,8 +436,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i64: @@ -456,8 +449,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i64: @@ -470,8 +462,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i64: @@ -480,12 +471,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_i64: @@ -494,11 +484,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, 
v3, vcc_lo ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5..09c0e77 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -124,9 +124,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i32: @@ -136,9 +135,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 
-v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i32: @@ -439,23 +433,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i32: @@ -465,23 +456,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i32: @@ -511,30 +499,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, 
s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i32: @@ -544,30 +528,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i32: @@ -599,58 +579,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX6-NEXT: 
v_sub_i32_e64 v8, s[4:5], v2, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v8i32: @@ -660,58 +632,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: 
v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i32: @@ -751,116 +715,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], 
vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], 
vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: @@ -870,116 +818,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, 
v16, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, 
v17, -v10, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: @@ -1066,8 +998,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i64: @@ -1080,8 +1011,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i64: @@ -1094,8 +1024,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i64: @@ -1104,12 +1033,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_sub_co_u32 v4, 
vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_i64: @@ -1118,11 +1046,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AVR/cmp.ll b/llvm/test/CodeGen/AVR/cmp.ll index efc9b8d..c932bda1 100644 --- a/llvm/test/CodeGen/AVR/cmp.ll +++ b/llvm/test/CodeGen/AVR/cmp.ll @@ -298,3 +298,18 @@ define i16 @cmp_i16_gt_1023(i16 %0) { %3 = zext i1 %2 to i16 ret i16 %3 } + +define void @cmp_issue152097(i16 %a) addrspace(1) { +; See: https://github.com/llvm/llvm-project/issues/152097 +; CHECK-LABEL: cmp_issue152097 +; CHECK: ldi r18, -1 +; CHECK-NEXT: cpi r24, -2 +; CHECK-NEXT: cpc r25, r18 +; CHECK-NEXT: ret + %cmp = icmp ugt i16 -2, %a + br i1 %cmp, label %if.then, label %if.else +if.then: + ret void +if.else: + ret void +} diff --git a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll new file mode 100644 index 0000000..25f81dd --- /dev/null +++ b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll @@ -0,0 +1,35 @@ +; Use llc for this test so that we don't abort after the first error. 
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+; Check that there is no overlap with an unbounded array in a different space.
+
+  ; Buffer<double> A[2] : register(t2, space4);
+  ; Buffer<double> B : register(t20, space5); // does not overlap
+  ; Buffer<double> C[] : register(t2, space4); // overlaps with A
+
+; CHECK: error: resource A at register 2 overlaps with resource C at register 2 in space 4
+; CHECK-NOT: error: resource C at register 2 overlaps with resource B at register 20 in space 5
+
+target triple = "dxil-pc-shadermodel6.3-library"
+
+@A.str = private unnamed_addr constant [2 x i8] c"A\00", align 1
+@B.str = private unnamed_addr constant [2 x i8] c"B\00", align 1
+@C.str = private unnamed_addr constant [2 x i8] c"C\00", align 1
+
+define void @test_not_overlapping_in_different_spaces() {
+entry:
+
+  ; Buffer<double> A[2] : register(t2, space4);
+  %h0 = call target("dx.TypedBuffer", double, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 2, i32 10, i1 false, ptr @A.str)
+
+  ; Buffer<double> B : register(t20, space5);
+  %h1 = call target("dx.TypedBuffer", i64, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 5, i32 20, i32 1, i32 0, i1 false, ptr @B.str)
+
+  ; Buffer<double> C[] : register(t2, space4);
+  %h2 = call target("dx.TypedBuffer", double, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 10, i1 false, ptr @C.str)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/imad.ll b/llvm/test/CodeGen/DirectX/imad.ll
index 5d9463d..2e612f0 100644
--- a/llvm/test/CodeGen/DirectX/imad.ll
+++ b/llvm/test/CodeGen/DirectX/imad.ll
@@ -1,17 +1,13 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower < %s | FileCheck %s
 ; Make sure dxil operation function calls for imad are generated for i16, i32, and i64.
-; CHECK:call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] -; CHECK:call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] -; CHECK:call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] - -; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-pc-shadermodel6.7-library" ; Function Attrs: noinline nounwind optnone define noundef i16 @imad_short(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 { entry: + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] %p2.addr = alloca i16, align 2 %p1.addr = alloca i16, align 2 %p0.addr = alloca i16, align 2 @@ -31,6 +27,7 @@ declare i16 @llvm.dx.imad.i16(i16, i16, i16) #1 ; Function Attrs: noinline nounwind optnone define noundef i32 @imad_int(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 { entry: + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i32, align 4 %p1.addr = alloca i32, align 4 %p0.addr = alloca i32, align 4 @@ -50,6 +47,7 @@ declare i32 @llvm.dx.imad.i32(i32, i32, i32) #1 ; Function Attrs: noinline nounwind optnone define noundef i64 @imad_int64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 { entry: + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i64, align 8 %p1.addr = alloca i64, align 8 %p0.addr = alloca i64, align 8 @@ -65,3 +63,95 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind willreturn declare i64 @llvm.dx.imad.i64(i64, i64, i64) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i16> @imad_int16_t4(<4 x i16> noundef %p0, <4 x i16> noundef %p1, <4 x i16> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i16> %p0, i64 0 + ; CHECK: extractelement <4 x i16> %p1, i64 0 + ; CHECK: extractelement <4 x i16> %p2, i64 0 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 1 + ; CHECK: extractelement <4 x i16> %p1, i64 1 + ; CHECK: extractelement <4 x i16> %p2, i64 1 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 2 + ; CHECK: extractelement <4 x i16> %p1, i64 2 + ; CHECK: extractelement <4 x i16> %p2, i64 2 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 3 + ; CHECK: extractelement <4 x i16> %p1, i64 3 + ; CHECK: extractelement <4 x i16> %p2, i64 3 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 3 + %dx.imad = call <4 x i16> @llvm.dx.imad.v4i16(<4 x i16> %p0, <4 x i16> %p1, <4 x i16> %p2) + ret <4 x i16> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i16> @llvm.dx.imad.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i32> @imad_int4(<4 x i32> noundef %p0, <4 x i32> noundef %p1, <4 x i32> noundef %p2) #0 
{ +entry: + ; CHECK: extractelement <4 x i32> %p0, i64 0 + ; CHECK: extractelement <4 x i32> %p1, i64 0 + ; CHECK: extractelement <4 x i32> %p2, i64 0 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 1 + ; CHECK: extractelement <4 x i32> %p1, i64 1 + ; CHECK: extractelement <4 x i32> %p2, i64 1 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 2 + ; CHECK: extractelement <4 x i32> %p1, i64 2 + ; CHECK: extractelement <4 x i32> %p2, i64 2 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 3 + ; CHECK: extractelement <4 x i32> %p1, i64 3 + ; CHECK: extractelement <4 x i32> %p2, i64 3 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 3 + %dx.imad = call <4 x i32> @llvm.dx.imad.v4i32(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2) + ret <4 x i32> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i32> @llvm.dx.imad.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i64> @imad_int64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i64> %p0, i64 0 + ; CHECK: extractelement <4 x i64> %p1, i64 0 + ; CHECK: extractelement <4 x i64> %p2, i64 0 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 1 + ; CHECK: extractelement <4 x i64> %p1, i64 1 + ; CHECK: extractelement <4 x i64> %p2, i64 1 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 2 + ; CHECK: extractelement <4 x i64> %p1, i64 2 + ; CHECK: extractelement <4 x i64> %p2, i64 2 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 3 + ; CHECK: extractelement <4 x i64> %p1, i64 3 + ; CHECK: extractelement <4 x i64> %p2, i64 3 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 3 + %dx.imad = call <4 x i64> @llvm.dx.imad.v4i64(<4 x i64> %p0, <4 x i64> %p1, <4 x i64> %p2) + ret <4 x i64> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i64> @llvm.dx.imad.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) #1 + +; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} diff --git a/llvm/test/CodeGen/DirectX/umad.ll b/llvm/test/CodeGen/DirectX/umad.ll index 104d238..76516a2 100644 --- a/llvm/test/CodeGen/DirectX/umad.ll +++ b/llvm/test/CodeGen/DirectX/umad.ll @@ -1,17 +1,13 @@ -; RUN: opt -S -dxil-op-lower < %s | FileCheck %s +; RUN: opt -S -scalarizer -dxil-op-lower < %s | FileCheck %s ; Make sure dxil operation function calls for 
round are generated for float and half. -; CHECK:call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] -; CHECK:call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] -; CHECK:call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] - -; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-pc-shadermodel6.7-library" ; Function Attrs: noinline nounwind optnone define noundef i16 @umad_ushort(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 { entry: + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] %p2.addr = alloca i16, align 2 %p1.addr = alloca i16, align 2 %p0.addr = alloca i16, align 2 @@ -31,6 +27,7 @@ declare i16 @llvm.dx.umad.i16(i16, i16, i16) #1 ; Function Attrs: noinline nounwind optnone define noundef i32 @umad_uint(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 { entry: + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i32, align 4 %p1.addr = alloca i32, align 4 %p0.addr = alloca i32, align 4 @@ -50,6 +47,7 @@ declare i32 @llvm.dx.umad.i32(i32, i32, i32) #1 ; Function Attrs: noinline nounwind optnone define noundef i64 @umad_uint64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 { entry: + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i64, align 8 %p1.addr = alloca i64, align 8 %p0.addr = alloca i64, align 8 @@ -65,3 +63,95 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind willreturn declare i64 @llvm.dx.umad.i64(i64, i64, i64) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i16> @umad_uint16_t4(<4 x i16> noundef %p0, <4 x i16> noundef %p1, <4 x i16> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i16> %p0, i64 0 + ; CHECK: extractelement <4 x i16> %p1, i64 0 + ; CHECK: extractelement <4 x i16> %p2, i64 0 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 1 + ; CHECK: extractelement <4 x i16> %p1, i64 1 + ; CHECK: extractelement <4 x i16> %p2, i64 1 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 2 + ; CHECK: extractelement <4 x i16> %p1, i64 2 + ; CHECK: extractelement <4 x i16> %p2, i64 2 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 3 + ; CHECK: extractelement <4 x i16> %p1, i64 3 + ; CHECK: extractelement <4 x i16> %p2, i64 3 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 3 + %dx.umad = call <4 x i16> @llvm.dx.umad.v4i16(<4 x i16> %p0, <4 x i16> %p1, <4 x i16> %p2) + ret <4 x i16> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i16> @llvm.dx.umad.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i32> @umad_uint4(<4 x i32> noundef %p0, <4 x 
i32> noundef %p1, <4 x i32> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i32> %p0, i64 0 + ; CHECK: extractelement <4 x i32> %p1, i64 0 + ; CHECK: extractelement <4 x i32> %p2, i64 0 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 1 + ; CHECK: extractelement <4 x i32> %p1, i64 1 + ; CHECK: extractelement <4 x i32> %p2, i64 1 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 2 + ; CHECK: extractelement <4 x i32> %p1, i64 2 + ; CHECK: extractelement <4 x i32> %p2, i64 2 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 3 + ; CHECK: extractelement <4 x i32> %p1, i64 3 + ; CHECK: extractelement <4 x i32> %p2, i64 3 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 3 + %dx.umad = call <4 x i32> @llvm.dx.umad.v4i32(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2) + ret <4 x i32> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i32> @llvm.dx.umad.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i64> @umad_uint64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i64> %p0, i64 0 + ; CHECK: extractelement <4 x i64> %p1, i64 0 + ; CHECK: extractelement <4 x i64> %p2, i64 0 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 1 + ; CHECK: extractelement <4 x i64> %p1, i64 1 + ; CHECK: extractelement <4 x i64> %p2, i64 1 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 2 + ; CHECK: extractelement <4 x i64> %p1, i64 2 + ; CHECK: extractelement <4 x i64> %p2, i64 2 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 3 + ; CHECK: extractelement <4 x i64> %p1, i64 3 + ; CHECK: extractelement <4 x i64> %p2, i64 3 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 3 + %dx.umad = call <4 x i64> @llvm.dx.umad.v4i64(<4 x i64> %p0, <4 x i64> %p1, <4 x i64> %p2) + ret <4 x i64> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i64> @llvm.dx.umad.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) #1 + +; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll new file mode 100644 index 0000000..3efe9be --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64
+
+; Inference from const address space
+define void @test_infer_const_from_cast() {
+; INFER-LABEL: @test_infer_const_from_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_const_from_cast(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
+ ret void
+}
+
+; Cast from Const space to Generic
+define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
+; INFER-LABEL: @test_const_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+; PTX-LABEL: .visible .func test_const_to_generic_cast(
+; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; No inference possible
+define void @test_no_inference_possible(ptr %generic_ptr) {
+; INFER-LABEL: @test_no_inference_possible
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+; PTX-LABEL: .visible .func test_no_inference_possible(
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+ ret void
+}
+
+; Cast from Parameter space to Generic
+define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
+; INFER-LABEL: @test_param_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+; PTX-LABEL: .visible .func test_param_to_generic_cast(
+; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; Multiple casts in sequence
+define void @test_infer_through_multiple_casts() {
+; INFER-LABEL: @test_infer_through_multiple_casts
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
+ %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+ ret void
+}
+
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
+
+
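For readers skimming the new test above: infer-address-spaces walks the addrspacecast chain feeding an address-space-overloaded intrinsic and, when the ultimate source lives in a specific space, rewrites the call to that space's overload, so the NVPTX backend can select prefetch.const.tensormap or prefetch.param.tensormap instead of the generic form. A minimal before/after sketch using the same global and intrinsics as the test (illustrative only, not part of the patch):

; Before the pass, the prefetch only sees a generic pointer.
@g = addrspace(4) global [64 x i8] zeroinitializer, align 64

define void @sketch() {
  %p = addrspacecast ptr addrspace(4) @g to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %p)
  ret void
}

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)

; After `opt -passes=infer-address-spaces`, the cast is folded away and the
; call is retargeted to the const-space overload:
;   call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @g)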
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index a64e4fe..862e26d 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
 declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) {
 ; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
ret void
+}
+
+define void @prefetch_tensormap(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+ ret void
+}
+
+define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+ ret void
+}
+
+define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+ ret void
}
\ No newline at end of file diff --git a/llvm/test/CodeGen/PowerPC/memintr32.ll b/llvm/test/CodeGen/PowerPC/memintr32.ll index c07a5af..4f0a996 100644 --- a/llvm/test/CodeGen/PowerPC/memintr32.ll +++ b/llvm/test/CodeGen/PowerPC/memintr32.ll @@ -11,7 +11,7 @@ define i32 @memcmp_test(ptr nocapture noundef readonly %ptr1, ptr nocapture noun ; CHECK-AIX-32-P9-NEXT: mflr r0 ; CHECK-AIX-32-P9-NEXT: stwu r1, -64(r1) ; CHECK-AIX-32-P9-NEXT: stw r0, 72(r1) -; CHECK-AIX-32-P9-NEXT: bl .memcmp[PR] +; CHECK-AIX-32-P9-NEXT: bl .___memcmp[PR] ; CHECK-AIX-32-P9-NEXT: nop ; CHECK-AIX-32-P9-NEXT: addi r1, r1, 64 ; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) diff --git a/llvm/test/CodeGen/PowerPC/memintr64.ll b/llvm/test/CodeGen/PowerPC/memintr64.ll index b3a6650..0b0e556 100644 --- a/llvm/test/CodeGen/PowerPC/memintr64.ll +++ b/llvm/test/CodeGen/PowerPC/memintr64.ll @@ -39,7 +39,7 @@ define noundef i32 @_Z11memcmp_testPKvS0_m(ptr noundef readonly captures(none) % ; CHECK-AIX-64-P9-NEXT: mflr r0 ; CHECK-AIX-64-P9-NEXT: stdu r1, -112(r1) ; CHECK-AIX-64-P9-NEXT: std r0, 128(r1) -; CHECK-AIX-64-P9-NEXT: bl .memcmp[PR] +; CHECK-AIX-64-P9-NEXT: bl .___memcmp64[PR] ; CHECK-AIX-64-P9-NEXT: nop ; CHECK-AIX-64-P9-NEXT: addi r1, r1, 112 ; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll index ea2453f..4fda253 100644 --- a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll +++ b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll @@ -31,7 +31,7 @@ define void @test_fpsig_return_i32(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 0) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -48,7 +48,7 @@ define void @test_fpsig_return_i64(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i64 0) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i64 poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -65,7 +65,7 @@ define void @test_fpsig_return_f32(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, float 0.) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, float poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -82,7 +82,7 @@ define void @test_fpsig_return_f64(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, double 0.) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, double poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -100,7 +100,7 @@ define void @test_fpsig_param_i32(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, double 0.) + %res = tail call i32 (ptr, ...) 
@llvm.wasm.ref.test.func(ptr %func, token poison, double poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -118,7 +118,7 @@ define void @test_fpsig_multiple_params_and_returns(ptr noundef %func) local_unn ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 0, i64 0, float 0., double 0., token poison, i64 0, float 0., i64 0) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 poison, i64 poison, float poison, double poison, token poison, i64 poison, float poison, i64 poison) tail call void @use(i32 noundef %res) #3 ret void } @@ -137,10 +137,26 @@ define void @test_fpsig_ptrs(ptr noundef %func) local_unnamed_addr #0 { ; CHECK-NEXT: call use ; CHECK-NEXT: # fallthrough-return entry: - %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr null, token poison, ptr null, ptr null) + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr poison, token poison, ptr poison, ptr poison) tail call void @use(i32 noundef %res) #3 ret void } +define void @test_reference_types(ptr noundef %func) local_unnamed_addr #0 { +; CHECK-LABEL: test_reference_types: +; CHK32: .functype test_reference_types (i32) -> () +; CHK64: .functype test_reference_types (i64) -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHK64-NEXT: i32.wrap_i64 +; CHECK-NEXT: table.get __indirect_function_table +; CHECK-NEXT: ref.test (funcref, externref) -> (externref) +; CHECK-NEXT: call use +; CHECK-NEXT: # fallthrough-return +entry: + %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr addrspace(10) poison, token poison, ptr addrspace(20) poison, ptr addrspace(10) poison) + tail call void @use(i32 noundef %res) #3 + ret void +} declare void @use(i32 noundef) local_unnamed_addr #1 diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll index c53b006..fc72631 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll @@ -74,3 +74,56 @@ loop: exit: ret void } + +define void @dynamic_alloca_lifetime_test(i1 %c, i64 %n) sanitize_type { +; CHECK-LABEL: @dynamic_alloca_lifetime_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[X:%.*]] = alloca i32, i64 [[N:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP0]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 0, i64 [[TMP6]], i1 false) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP7]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP12]], i8 0, 
i64 [[TMP13]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[X]]) +; CHECK-NEXT: call void @alloca_test_use(ptr [[X]]) +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP16]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP14]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP19]], i8 0, i64 [[TMP20]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[X]]) +; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %x = alloca i32, i64 %n, align 1 + br label %loop + +loop: + call void @llvm.lifetime.start.p0(i64 -1, ptr %x) + call void @alloca_test_use(ptr %x) + call void @llvm.lifetime.end.p0(i64 -1, ptr %x) + br i1 %c, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/MC/Disassembler/RISCV/riscv-mapping-symbols.s b/llvm/test/MC/Disassembler/RISCV/riscv-mapping-symbols.s new file mode 100644 index 0000000..ff15008 --- /dev/null +++ b/llvm/test/MC/Disassembler/RISCV/riscv-mapping-symbols.s @@ -0,0 +1,20 @@ +# RUN: llvm-mc --triple=riscv32-unknown-none-elf %s -filetype=obj -o - \ +# RUN: | llvm-objdump -dr - \ +# RUN: | FileCheck %s +# RUN: llvm-mc --triple=riscv64-unknown-none-elf %s -filetype=obj -o - \ +# RUN: | llvm-objdump -dr - \ +# RUN: | FileCheck %s + + + # CHECK: 00000013 nop + nop + + # CHECK-NEXT: 55 55 55 55 .word 0x55555555 + .word 0x55555555 + + # CHECK-NEXT: 00 00 00 00 .word 0x00000000 + # CHECK-NEXT: R_RISCV_32 foo + .word foo + + # CHECK-NEXT: 00000013 nop + nop diff --git a/llvm/test/MC/ELF/many-instructions.s b/llvm/test/MC/ELF/many-instructions.s index 843d35f..7c13c0d 100644 --- a/llvm/test/MC/ELF/many-instructions.s +++ b/llvm/test/MC/ELF/many-instructions.s @@ -1,4 +1,5 @@ -# REQUIRES: asserts +## Checks the size of an internal MC structure that is different on 32-bit. +# REQUIRES: asserts, llvm-64-bits # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null -debug-only=mc-dump 2>&1 | grep -E -o '[0-9]+ Data Size:[0-9]+' | FileCheck %s ## Test that encodeInstruction may cause a new fragment to be created. 
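Stepping back to the TypeSanitizer alloca test above: the repeated CHECK sequence is tysan's shadow mapping. Every application byte is described by eight shadow bytes located at ((addr & __tysan_app_memory_mask) << 3) + __tysan_shadow_memory_address, and zeroing that range marks the memory as not yet having a type, which is why each lifetime boundary of the dynamic alloca is preceded by a memset of the n * 4 allocation bytes scaled by 8. A self-contained IR sketch of that mapping, using the runtime globals named in the test (a sketch for illustration, not part of the patch):

; Zero the shadow for %size application bytes at %addr.
@__tysan_app_memory_mask = external global i64
@__tysan_shadow_memory_address = external global i64

define void @tysan_clear_shadow(ptr %addr, i64 %size) {
  %mask = load i64, ptr @__tysan_app_memory_mask, align 8
  %base = load i64, ptr @__tysan_shadow_memory_address, align 8
  %app = ptrtoint ptr %addr to i64
  %masked = and i64 %app, %mask
  %scaled = shl i64 %masked, 3          ; 8 shadow bytes per application byte
  %shadow.int = add i64 %scaled, %base
  %shadow = inttoptr i64 %shadow.int to ptr
  %bytes = shl i64 %size, 3
  call void @llvm.memset.p0.i64(ptr align 8 %shadow, i8 0, i64 %bytes, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr writeonly, i8, i64, i1 immarg)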
diff --git a/llvm/test/MC/RISCV/large-instructions.s b/llvm/test/MC/RISCV/large-instructions.s deleted file mode 100644 index b50dbde..0000000 --- a/llvm/test/MC/RISCV/large-instructions.s +++ /dev/null @@ -1,29 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ -# RUN: | llvm-objdump -d - | FileCheck %s - -# CHECK: 011f 4523 8967 <unknown> -.byte 0x1f, 0x01, 0x23, 0x45, 0x67, 0x89 - -# CHECK: 4523013f cdab8967 <unknown> -.byte 0x3f, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd - -# CHECK: 007f 4523 8967 cdab feef <unknown> -.byte 0x7f, 0x00, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe - -# CHECK: 4523107f cdab8967 badcfeef <unknown> -.byte 0x7f, 0x10, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba - -# CHECK: 207f 4523 8967 cdab feef badc 7698 <unknown> -.byte 0x7f, 0x20, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76 - -# CHECK: 4523307f cdab8967 badcfeef 32547698 <unknown> -.byte 0x7f, 0x30, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32 - -# CHECK: 407f 4523 8967 cdab feef badc 7698 3254 1210 <unknown> -.byte 0x7f, 0x40, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12 - -# CHECK: 4523507f cdab8967 badcfeef 32547698 56341210 <unknown> -.byte 0x7f, 0x50, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56 - -# CHECK: 607f 4523 8967 cdab feef badc 7698 3254 1210 5634 9a78 <unknown> -.byte 0x7f, 0x60, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, 0x78, 0x9a diff --git a/llvm/test/MC/RISCV/large-instructions.test b/llvm/test/MC/RISCV/large-instructions.test new file mode 100644 index 0000000..b8396a9 --- /dev/null +++ b/llvm/test/MC/RISCV/large-instructions.test @@ -0,0 +1,60 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objdump -d %t | FileCheck %s + +## This CHECKs objdump's handling of wide instruction encodings, and how it +## groups the instruction bytes when disassembling. +## +## This is written in YAML because using `.byte` emits the wrong mapping +## symbols. 
+ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_RISCV + SectionHeaderStringTable: .strtab +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x1 + ContentArray: [ + # CHECK: 011f 4523 8967 <unknown> + 0x1f, 0x01, 0x23, 0x45, 0x67, 0x89, + + # CHECK: 4523013f cdab8967 <unknown> + 0x3f, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, + + # CHECK: 007f 4523 8967 cdab feef <unknown> + 0x7f, 0x00, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, + + # CHECK: 4523107f cdab8967 badcfeef <unknown> + 0x7f, 0x10, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, + + # CHECK: 207f 4523 8967 cdab feef badc 7698 <unknown> + 0x7f, 0x20, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, + + # CHECK: 4523307f cdab8967 badcfeef 32547698 <unknown> + 0x7f, 0x30, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, + + # CHECK: 407f 4523 8967 cdab feef badc 7698 3254 1210 <unknown> + 0x7f, 0x40, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, + + # CHECK: 4523507f cdab8967 badcfeef 32547698 56341210 <unknown> + 0x7f, 0x50, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, + + # CHECK: 607f 4523 8967 cdab feef badc 7698 3254 1210 5634 9a78 <unknown> + 0x7f, 0x60, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, 0x78, 0x9a, + ] + + - Type: SectionHeaderTable + Sections: + - Name: .strtab + - Name: .symtab + - Name: .text +Symbols: + - Name: "$x" + Section: .text + Value: 0x0 +... diff --git a/llvm/test/MC/RISCV/nop-slide.s b/llvm/test/MC/RISCV/nop-slide.s index 4dc888b..a49ffdc 100644 --- a/llvm/test/MC/RISCV/nop-slide.s +++ b/llvm/test/MC/RISCV/nop-slide.s @@ -10,18 +10,15 @@ auipc a0, 0 # CHECK-RVC-NORELAX: 0000000000000000 <.text>: -# CHECK-RVC-NORELAX-NEXT: 0: 0000 unimp -# CHECK-RVC-NORELAX-NEXT: 2: 0001 nop +# CHECK-RVC-NORELAX-NEXT: 0: 00 00 01 00 .word 0x00010000 # CHECK-RVC-NORELAX-NEXT: 4: 00000517 auipc a0, 0x0 # CHECK-RVC-RELAX: 0000000000000000 <.text>: # CHECK-RVC-RELAX-NEXT: 0: 0001 nop -# CHECK-RVC-RELAX-NEXT: 2: 0100 addi s0, sp, 0x80 -# CHECK-RVC-RELAX-NEXT: 4: 1700 addi s0, sp, 0x3a0 -# CHECK-RVC-RELAX-NEXT: 6: 0005 c.nop 0x1 -# CHECK-RVC-RELAX-NEXT: 8: 00 <unknown> +# CHECK-RVC-RELAX-NEXT: 2: 00 01 .short 0x0100 +# CHECK-RVC-RELAX-NEXT: 4: 00 .byte 0x00 +# CHECK-RVC-RELAX-NEXT: 5: 00000517 auipc a0, 0x0 # CHECK: 0000000000000000 <.text>: -# CHECK-NEXT: 0: 0000 <unknown> -# CHECK-NEXT: 2: 0000 <unknown> +# CHECK-NEXT: 0: 00 00 00 00 .word 0x00000000 # CHECK-NEXT: 4: 00000517 auipc a0, 0x0 diff --git a/llvm/test/MC/RISCV/rvv/vsetvl-invalid.s b/llvm/test/MC/RISCV/rvv/vsetvl-invalid.s index b45f3f2..d97b538 100644 --- a/llvm/test/MC/RISCV/rvv/vsetvl-invalid.s +++ b/llvm/test/MC/RISCV/rvv/vsetvl-invalid.s @@ -4,37 +4,37 @@ # RUN: | llvm-objdump -d --mattr=+v - | FileCheck %s # CHECK: vsetvli a1, a0, e64, m1, tu, mu -.word 0x018575d7 +.insn 4, 0x018575d7 # CHECK: vsetvli a1, a0, 0x1c -.word 0x01c575d7 +.insn 4, 0x01c575d7 # CHECK: vsetvli a1, a0, 0x24 -.word 0x024575d7 +.insn 4, 0x024575d7 # CHECK: vsetvli a1, a0, 0x29 -.word 0x029575d7 +.insn 4, 0x029575d7 # CHECK: vsetvli a1, a0, 0x110 -.word 0x110575d7 +.insn 4, 0x110575d7 # CHECK: vsetvli a1, a0, e64, mf8, tu, mu -.word 0x01d575d7 +.insn 4, 0x01d575d7 # CHECK: vsetivli a1, 0x10, e8, m4, tu, mu -.word 0xc02875d7 +.insn 4, 0xc02875d7 
# CHECK: vsetivli a1, 0x10, 0xc -.word 0xc0c875d7 +.insn 4, 0xc0c875d7 # CHECK: vsetivli a1, 0x10, 0x14 -.word 0xc14875d7 +.insn 4, 0xc14875d7 # CHECK: vsetivli a1, 0x10, 0x38 -.word 0xc38875d7 +.insn 4, 0xc38875d7 # CHECK: vsetivli a1, 0x10, 0x103 -.word 0xd03875d7 +.insn 4, 0xd03875d7 # CHECK: vsetivli a1, 0x10, e8, mf4, tu, mu -.word 0xc06875d7 +.insn 4, 0xc06875d7 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll index 8495dee..b4df63d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll @@ -1,47 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 5 -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S \ -; RUN: -debug-only=loop-vectorize %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s -; FIXME: Hoisted vector code should be costed with scalable cost. -; In this example, `<vscale x 4 x float> @llvm.minimumnum` has an invalid cost, -; and hence should not be produced by LoopVectorize. - -; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = tail call float @llvm.minimumnum.f32(float %arg, float 0.000000e+00) define void @cost_hoisted_vector_code(ptr %p, float %arg) { ; CHECK-LABEL: define void @cost_hoisted_vector_code( ; CHECK-SAME: ptr [[P:%.*]], float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 -1, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 -1, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[ARG]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add i64 1, [[N_VEC]] -; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[BROADCAST_SPLAT]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[ARG]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> zeroinitializer) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = add i64 1, [[INDEX1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 
[[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP10]] -; CHECK-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP8]], align 4 -; CHECK-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP8]], i32 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], -8 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 -1, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll index 20bc0af..76a7536 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520 +; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a320 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA320 define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA510-LABEL: define void @sve_add( @@ -131,6 +132,70 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA520: [[FOR_COND_CLEANUP]]: ; CHECK-CA520-NEXT: ret void ; +; CHECK-CA320-LABEL: define void @sve_add( +; CHECK-CA320-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-CA320-NEXT: [[ENTRY:.*:]] +; CHECK-CA320-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-CA320-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA320-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-CA320-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-CA320-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-CA320: [[FOR_BODY_PREHEADER]]: +; CHECK-CA320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-CA320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-CA320: [[VECTOR_MEMCHECK]]: +; CHECK-CA320-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] +; CHECK-CA320-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-CA320-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] +; CHECK-CA320-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-CA320-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-CA320-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA320: [[VECTOR_PH]]: +; CHECK-CA320-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-CA320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-CA320-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-CA320: [[VECTOR_BODY]]: +; CHECK-CA320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-CA320-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]] +; CHECK-CA320-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-CA320-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]] +; CHECK-CA320-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-CA320-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] +; CHECK-CA320-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] +; CHECK-CA320-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]] +; CHECK-CA320-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4 +; CHECK-CA320-NEXT: store <4 x float> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-CA320-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4 +; CHECK-CA320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-CA320-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-CA320-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-CA320: [[MIDDLE_BLOCK]]: +; CHECK-CA320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-CA320-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-CA320: [[SCALAR_PH]]: +; CHECK-CA320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-CA320-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-CA320: [[FOR_BODY]]: +; CHECK-CA320-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-CA320-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-CA320-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-CA320-NEXT: [[ADD:%.*]] = fadd fast float [[TMP12]], [[TMP11]] +; CHECK-CA320-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 +; CHECK-CA320-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-CA320-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-CA320-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-CA320: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-CA320-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-CA320: [[FOR_COND_CLEANUP]]: +; CHECK-CA320-NEXT: ret void +; entry: %cmp9.not = icmp eq i64 %n, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body @@ -160,3 +225,8 @@ for.cond.cleanup: ; preds = 
%for.cond.cleanup.lo ; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ;. +; CHECK-CA320: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-CA320: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-CA320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-CA320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index ce7b78e..2b01018 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -1,81 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5 ; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { -; CHECK-LABEL: @trip7_i64( -; CHECK: = call i64 @llvm.vscale.i64() -; CHECK-NEXT: = mul nuw i64 -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul nuw i64 [[VSCALE]], 2 -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] -; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; CHECK: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> {{%.*}}, ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] +; CHECK-LABEL: define void @trip7_i64( +; CHECK-SAME: ptr noalias noundef captures(none) [[DST:%.*]], ptr noalias noundef readonly captures(none) [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 7, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 7) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> 
[[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP6]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NOT]], i32 0 -; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: ; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i64, ptr %src, i64 %i.06 - %0 = load i64, ptr %arrayidx, align 8 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv + %0 = load i64, ptr %gep.src, align 8 %mul = shl nsw i64 %0, 1 - %arrayidx1 = getelementptr inbounds i64, ptr %dst, i64 %i.06 - %1 = load i64, ptr %arrayidx1, align 8 + %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv + %1 = load i64, ptr %gep.dst, align 8 %add = add nsw i64 %1, %mul - store i64 %add, ptr %arrayidx1, align 8 - %inc = add nuw nsw i64 %i.06, 1 - %exitcond.not = icmp eq i64 %inc, 7 - br i1 %exitcond.not, label %for.end, label %for.body + store i64 %add, ptr %gep.dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 7 + br i1 %ec, label %exit, label %loop -for.end: ; preds = %for.body +exit: ret void } define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { -; CHECK-LABEL: @trip5_i8( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-LABEL: define void @trip5_i8( +; CHECK-SAME: ptr noalias noundef captures(none) [[DST:%.*]], ptr noalias noundef readonly captures(none) [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[GEP_SRC]], align 1 ; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = 
getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] -; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: +; CHECK-NEXT: store i8 [[ADD]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 5 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 - %0 = load i8, ptr %arrayidx, align 1 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv + %0 = load i8, ptr %gep.src, align 1 %mul = shl i8 %0, 1 - %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 - %1 = load i8, ptr %arrayidx1, align 1 + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + %1 = load i8, ptr %gep.dst, align 1 %add = add i8 %mul, %1 - store i8 %add, ptr %arrayidx1, align 1 - %inc = add nuw nsw i64 %i.08, 1 - %exitcond.not = icmp eq i64 %inc, 5 - br i1 %exitcond.not, label %for.end, label %for.body + store i8 %add, ptr %gep.dst, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop -for.end: ; preds = %for.body +exit: ret void } diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll index 9c910d7..10d83a4 100644 --- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll +++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll @@ -324,6 +324,56 @@ for.end: ; preds = %for.body, %entry declare double @llvm.exp2.f64(double) +define void @ldexp_f32i32(i32 %n, ptr %y, ptr %x, i32 %exp) { +; CHECK-LABEL: @ldexp_f32i32( +; CHECK: llvm.ldexp.v4f32.v4i32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.ldexp.f32.i32(float %0, i32 %exp) + %arrayidx2 = getelementptr inbounds float, ptr %x, i32 %iv + store float %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare float @llvm.ldexp.f32.i32(float, i32) + +define void @ldexp_f64i32(i32 %n, ptr %y, ptr %x, i32 %exp) { +; CHECK-LABEL: @ldexp_f64i32( +; CHECK: llvm.ldexp.v4f64.v4i32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call double @llvm.ldexp.f64.i32(double %0, i32 %exp) + %arrayidx2 = getelementptr inbounds double, ptr %x, i32 %iv + store double %call, ptr %arrayidx2, align 8 + %iv.next = 
add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare double @llvm.ldexp.f64.i32(double, i32) + define void @log_f32(i32 %n, ptr %y, ptr %x) { ; CHECK-LABEL: @log_f32( ; CHECK: llvm.log.v4f32 @@ -976,6 +1026,157 @@ for.end: ; preds = %for.body, %entry declare double @llvm.roundeven.f64(double) + +define void @lround_i32f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i32f32( +; CHECK: llvm.lround.v4i32.v4f32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i32 @llvm.lround.i32.f32(float %0) + %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv + store i32 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @llvm.lround.i32.f32(float) + +define void @lround_i32f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i32f64( +; CHECK: llvm.lround.v4i32.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i32 @llvm.lround.i32.f64(double %0) + %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv + store i32 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @llvm.lround.i32.f64(double) + +define void @lround_i64f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i64f32( +; CHECK: llvm.lround.v4i64.v4f32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i64 @llvm.lround.i64.f32(float %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.lround.i64.f32(float) + +define void @lround_i64f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i64f64( +; CHECK: llvm.lround.v4i64.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i64 @llvm.lround.i64.f64(double %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.lround.i64.f64(double) + +define void @llround_i64f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @llround_i64f32( +; CHECK: llvm.llround.v4i64.v4f32 +; CHECK: ret void +; +entry: + br 
label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i64 @llvm.llround.i64.f32(float %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.llround.i64.f32(float) + +define void @llround_i64f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @llround_i64f64( +; CHECK: llvm.llround.v4i64.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i64 @llvm.llround.i64.f64(double %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.llround.i64.f64(double) + define void @fma_f32(i32 %n, ptr %y, ptr %x, ptr %z, ptr %w) { ; CHECK-LABEL: @fma_f32( ; CHECK: llvm.fma.v4f32 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll index 4427699..9e086dca 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -8,15 +8,18 @@ target triple = "aarch64--linux-gnu" define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: @@ -47,15 +50,18 @@ for.end27: define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll new file mode 100644 index 0000000..301e5da --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll @@ -0,0 +1,279 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define void @ldexp_f32i32(ptr %x, ptr %y, i32 %exp) { +; CHECK-LABEL: @ldexp_f32i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L0]], i32 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L2]], i32 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L4]], i32 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L6]], i32 [[EXP]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + 
%arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i32(float %l0, i32 %exp) + %l3 = tail call float @llvm.ldexp.f32.i32(float %l2, i32 %exp) + %l5 = tail call float @llvm.ldexp.f32.i32(float %l4, i32 %exp) + %l7 = tail call float @llvm.ldexp.f32.i32(float %l6, i32 %exp) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64i32(ptr %x, ptr %y, i32 %exp) { +; CHECK-LABEL: @ldexp_f64i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L0]], i32 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L2]], i32 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L4]], i32 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L6]], i32 [[EXP]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i32(double %l0, i32 %exp) + %l3 = tail call double @llvm.ldexp.f64.i32(double %l2, i32 %exp) + %l5 = tail call double @llvm.ldexp.f64.i32(double %l4, i32 %exp) + %l7 = tail call double @llvm.ldexp.f64.i32(double %l6, i32 %exp) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f32i64(ptr %x, ptr %y, i64 %exp) { +; 
CHECK-LABEL: @ldexp_f32i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L0]], i64 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L2]], i64 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L4]], i64 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L6]], i64 [[EXP]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i64(float %l0, i64 %exp) + %l3 = tail call float @llvm.ldexp.f32.i64(float %l2, i64 %exp) + %l5 = tail call float @llvm.ldexp.f32.i64(float %l4, i64 %exp) + %l7 = tail call float @llvm.ldexp.f32.i64(float %l6, i64 %exp) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64i64(ptr %x, ptr %y, i64 %exp) { +; CHECK-LABEL: @ldexp_f64i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L0]], i64 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L2]], i64 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L4]], i64 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L6]], i64 [[EXP]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = 
getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i64(double %l0, i64 %exp) + %l3 = tail call double @llvm.ldexp.f64.i64(double %l2, i64 %exp) + %l5 = tail call double @llvm.ldexp.f64.i64(double %l4, i64 %exp) + %l7 = tail call double @llvm.ldexp.f64.i64(double %l6, i64 %exp) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f32i32_i64(ptr %x, ptr %y, i32 %exp32, i64 %exp64) { +; CHECK-LABEL: @ldexp_f32i32_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L0]], i32 [[EXP32:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L2]], i32 [[EXP32]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L4]], i64 [[EXP64:%.*]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L6]], i64 [[EXP64]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i32(float %l0, i32 %exp32) + %l3 = tail call float @llvm.ldexp.f32.i32(float %l2, i32 %exp32) + %l5 = tail call float @llvm.ldexp.f32.i64(float %l4, 
i64 %exp64) + %l7 = tail call float @llvm.ldexp.f32.i64(float %l6, i64 %exp64) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64_i32_i64(ptr %x, ptr %y, i32 %exp32, i64 %exp64) { +; CHECK-LABEL: @ldexp_f64_i32_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L0]], i32 [[EXP32:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L2]], i32 [[EXP32]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L4]], i64 [[EXP64:%.*]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L6]], i64 [[EXP64]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i32(double %l0, i32 %exp32) + %l3 = tail call double @llvm.ldexp.f64.i32(double %l2, i32 %exp32) + %l5 = tail call double @llvm.ldexp.f64.i64(double %l4, i64 %exp64) + %l7 = tail call double @llvm.ldexp.f64.i64(double %l6, i64 %exp64) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +declare float @llvm.ldexp.f32.i32(float, i32) +declare double @llvm.ldexp.f64.i32(double, i32) +declare float @llvm.ldexp.f32.i64(float, i64) +declare double @llvm.ldexp.f64.i64(double, i64) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll new file mode 100644 index 0000000..07a3fe7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll @@ -0,0 +1,280 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define void @lround_i32f32(ptr %x, ptr %y, i32 %n) { +; CHECK-LABEL: @lround_i32f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L6]]) +; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1 +; CHECK-NEXT: store i32 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2 +; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3 +; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i32 @llvm.lround.i32.f32(float %l0) + %l3 = tail call i32 @llvm.lround.i32.f32(float %l2) + %l5 = tail call i32 @llvm.lround.i32.f32(float %l4) + %l7 = tail call i32 @llvm.lround.i32.f32(float %l6) + store i32 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i32, ptr %y, i64 1 + store i32 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i32, ptr %y, i64 2 + store i32 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i32, ptr %y, i64 3 + store i32 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i32f64(ptr %x, ptr %y, i32 %n) { +; CHECK-LABEL: @lround_i32f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L6]]) +; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], 
align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1 +; CHECK-NEXT: store i32 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2 +; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3 +; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i32 @llvm.lround.i32.f64(double %l0) + %l3 = tail call i32 @llvm.lround.i32.f64(double %l2) + %l5 = tail call i32 @llvm.lround.i32.f64(double %l4) + %l7 = tail call i32 @llvm.lround.i32.f64(double %l6) + store i32 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i32, ptr %y, i64 1 + store i32 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i32, ptr %y, i64 2 + store i32 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i32, ptr %y, i64 3 + store i32 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i64f32(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @lround_i64f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.lround.i64.f32(float %l0) + %l3 = tail call i64 @llvm.lround.i64.f32(float %l2) + %l5 = tail call i64 @llvm.lround.i64.f32(float %l4) + %l7 = tail call i64 @llvm.lround.i64.f32(float %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr 
%arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i64f64(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @lround_i64f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.lround.i64.f64(double %l0) + %l3 = tail call i64 @llvm.lround.i64.f64(double %l2) + %l5 = tail call i64 @llvm.lround.i64.f64(double %l4) + %l7 = tail call i64 @llvm.lround.i64.f64(double %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @llround_i64f32(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @llround_i64f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 
@llvm.llround.i64.f32(float [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.llround.i64.f32(float %l0) + %l3 = tail call i64 @llvm.llround.i64.f32(float %l2) + %l5 = tail call i64 @llvm.llround.i64.f32(float %l4) + %l7 = tail call i64 @llvm.llround.i64.f32(float %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @llround_i64f64(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @llround_i64f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.llround.i64.f64(double %l0) + %l3 = tail call i64 @llvm.llround.i64.f64(double %l2) + %l5 = tail call i64 @llvm.llround.i64.f64(double %l4) + %l7 = tail call i64 @llvm.llround.i64.f64(double %l6) + store i64 
%l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +declare i32 @llvm.lround.i32.f32(float) +declare i32 @llvm.lround.i32.f64(double) +declare i64 @llvm.lround.i64.f32(float) +declare i64 @llvm.lround.i64.f64(double) +declare i64 @llvm.llround.i64.f32(float) +declare i64 @llvm.llround.i64.f64(double) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 295a718..2e68432 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -12,7 +12,8 @@ define void @test() { ; CHECK: [[BB63]]: ; CHECK-NEXT: br label %[[BB64]] ; CHECK: [[BB64]]: -; CHECK-NEXT: [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] ; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8 @@ -24,57 +25,125 @@ define void @test() { ; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3 -;
CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5> -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison -; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison -; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15> +; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I89:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I90:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I91:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I92:%.*]] = fadd fast float [[I91]], [[I87]] +; CHECK-NEXT: [[I93:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I94:%.*]] = fadd fast float [[I93]], [[I88]] +; CHECK-NEXT: [[I95:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I96:%.*]] = fadd fast float [[I95]], [[I89]] +; CHECK-NEXT: [[I97:%.*]] = fmul fast float [[I86]], 
poison +; CHECK-NEXT: [[I98:%.*]] = fadd fast float [[I97]], [[I90]] +; CHECK-NEXT: [[I99:%.*]] = fadd fast float [[I92]], poison +; CHECK-NEXT: [[I100:%.*]] = fadd fast float [[I94]], poison +; CHECK-NEXT: [[I101:%.*]] = fadd fast float [[I96]], poison +; CHECK-NEXT: [[I102:%.*]] = fadd fast float [[I98]], poison +; CHECK-NEXT: [[I103]] = fadd fast float [[I99]], poison +; CHECK-NEXT: [[I104]] = fadd fast float [[I100]], poison +; CHECK-NEXT: [[I105]] = fadd fast float [[I101]], poison +; CHECK-NEXT: [[I106]] = fadd fast float [[I102]], poison +; CHECK-NEXT: [[I107:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I108:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I109:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I110:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I111:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I112:%.*]] = fadd fast float [[I111]], [[I107]] +; CHECK-NEXT: [[I113:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I114:%.*]] = fadd fast float [[I113]], [[I108]] +; CHECK-NEXT: [[I115:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I116:%.*]] = fadd fast float [[I115]], [[I109]] +; CHECK-NEXT: [[I117:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[I118:%.*]] = fadd fast float [[I117]], [[I110]] +; CHECK-NEXT: [[I119:%.*]] = fadd fast float [[I112]], poison +; CHECK-NEXT: [[I120:%.*]] = fadd fast float [[I114]], poison +; CHECK-NEXT: [[I121:%.*]] = fadd fast float [[I116]], poison +; CHECK-NEXT: [[I122:%.*]] = fadd fast float [[I118]], poison +; CHECK-NEXT: [[I123]] = fadd fast float [[I119]], poison +; CHECK-NEXT: [[I124]] = fadd fast float [[I120]], poison +; CHECK-NEXT: [[I125]] = fadd fast float [[I121]], poison +; CHECK-NEXT: [[I126]] = fadd fast float [[I122]], poison +; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]] +; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]] +; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]] +; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]] +; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]] +; CHECK-NEXT: [[I134:%.*]] = fadd fast float [[I133]], [[I135]] +; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]] +; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]] +; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]] +; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]] +; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]] +; CHECK-NEXT: [[I139:%.*]] = fadd fast float [[I137]], [[I130]] +; CHECK-NEXT: [[I140:%.*]] = fadd fast float [[I134]], poison +; CHECK-NEXT: [[I141:%.*]] = fadd fast float [[TMP51]], poison +; CHECK-NEXT: [[I142:%.*]] = fadd fast float [[TMP52]], poison +; CHECK-NEXT: [[I143:%.*]] = fadd fast float [[I139]], poison +; CHECK-NEXT: [[I144:%.*]] = fadd fast float [[I140]], poison +; CHECK-NEXT: [[I145:%.*]] = fadd fast float [[I141]], poison +; CHECK-NEXT: [[I146:%.*]] = fadd fast float [[I142]], poison +; CHECK-NEXT: [[I152:%.*]] = fadd fast float [[I143]], poison +; CHECK-NEXT: [[I147:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I148:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I149:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I150:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I151:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]] +; CHECK-NEXT: [[I153:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[TMP58:%.*]] = fadd fast 
float [[I153]], [[I148]] +; CHECK-NEXT: [[I155:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float [[I155]], [[I149]] +; CHECK-NEXT: [[I157:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]] +; CHECK-NEXT: [[I159:%.*]] = fadd fast float [[TMP57]], poison +; CHECK-NEXT: [[I160:%.*]] = fadd fast float [[TMP58]], poison +; CHECK-NEXT: [[I161:%.*]] = fadd fast float [[TMP59]], poison +; CHECK-NEXT: [[I162:%.*]] = fadd fast float [[TMP60]], poison +; CHECK-NEXT: [[I163:%.*]] = fadd fast float [[I159]], poison +; CHECK-NEXT: [[I164:%.*]] = fadd fast float [[I160]], poison +; CHECK-NEXT: [[I165:%.*]] = fadd fast float [[I161]], poison +; CHECK-NEXT: [[I166:%.*]] = fadd fast float [[I162]], poison ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: -; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP32]], i32 14 +; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ] +; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ] +; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ] +; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ] +; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ] +; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ] +; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ] +; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ] +; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ] +; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ] +; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ] +; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ] +; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ] +; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ] ; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP32]], i32 13 ; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP32]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]] ; CHECK: [[BB184]]: ; CHECK-NEXT: br label %[[BB185:.*]] ; CHECK: [[BB185]]: ; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]] ; CHECK: [[BB186]]: -; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP35]], %[[BB167]] ], [ poison, %[[BB185]] ] +; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 64bdcf2..8093285 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -8,35 +8,56 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP9]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] ; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 ; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = 
uitofp <4 x i8> [[TMP12]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) ; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 27de36e..430a46b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -600,29 +600,25 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { } define float @dot_product_fp32(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, 
ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -650,29 +646,25 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. 
define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32_reorder( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32_reorder( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -699,29 +691,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) { define double @dot_product_fp64(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp64( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr 
[[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) -; NON-POW2-NEXT: ret double [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp64( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret double [[ADD_1]] +; CHECK-LABEL: @dot_product_fp64( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret double [[ADD_1]] ; %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0 %l.a.0 = load double, ptr %gep.a.0, align 4 @@ -778,21 +766,13 @@ entry: } define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) { -; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01) -; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]]) -; NON-POW2-NEXT: ret float [[TMP5]] -; -; 
POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec( +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %mul.0 = fmul fast float %a, 10.0 %mul.1 = fmul fast float %b, 10.0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index 4a8af6d..0879ec2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 ; ; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3])) @@ -95,12 +95,47 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; CHECK-LABEL: @dot4f64_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: ret double [[TMP4]] +; SSE2-LABEL: @dot4f64_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE2-NEXT: ret double [[TMP4]] +; +; SSE4-LABEL: @dot4f64_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: ret double [[TMP4]] +; +; AVX-LABEL: @dot4f64_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4
x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; AVX-NEXT: ret double [[TMP4]] +; +; AVX2-LABEL: @dot4f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -127,12 +162,47 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 } define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot4f32_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; CHECK-NEXT: ret float [[TMP4]] +; SSE2-LABEL: @dot4f32_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE2-NEXT: ret float [[TMP4]] +; +; SSE4-LABEL: @dot4f32_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE4-NEXT: ret float [[TMP4]] +; +; AVX-LABEL: @dot4f32_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; AVX-NEXT: ret float [[TMP4]] +; +; AVX2-LABEL: @dot4f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr 
inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 @@ -372,6 +442,18 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] ; AVX-NEXT: ret double [[DOT01]] ; +; AVX2-LABEL: @dot2f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret double [[DOT01]] +; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 %x0 = load double, ptr %ptrx, align 4 @@ -410,6 +492,18 @@ define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; AVX-NEXT: ret float [[DOT01]] ; +; AVX2-LABEL: @dot2f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret float [[DOT01]] +; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 %x0 = load float, ptr %ptrx, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index eaa77d7..0bbdeb55 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -31,12 +31,9 @@ define float @baz() { ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], 2.000000e+00 ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00) -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] ; THRESHOLD-NEXT: store float [[OP_RDX]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX]] @@ -76,14 +73,41 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load float, 
ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; CHECK-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; CHECK-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX1]] ; @@ -92,14 +116,41 @@ define float @bazz() { ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) 
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -151,10 +202,21 @@ define float @bazzz() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), 
align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], ptr @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -163,10 +225,21 @@ define float @bazzz() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -199,10 +272,21 @@ define i32 @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load 
float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], ptr @n, align 4 @@ -212,10 +296,21 @@ define i32 @foo() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], ptr @n, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index 9fbe0a5..ea637bb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -8,119 +8,134 @@ %"class.3" = type { %"struct.1", i64 } %"struct.1" = type { [8 x i64] } -$_ZN1C10SwitchModeEv = comdat any - ; Function Attrs: uwtable -define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { +define void @_ZN1C10SwitchModeEv(ptr %p, i64 %c) { ; SSE-LABEL: @_ZN1C10SwitchModeEv( ; SSE-NEXT: for.body.lr.ph.i: -; SSE-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; SSE-NEXT: store i64 [[OR_1]], ptr undef, align 8 -; SSE-NEXT: [[FOO_3:%.*]] 
= load i64, ptr undef, align 8 -; SSE-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 +; SSE-NEXT: [[BAR5:%.*]] = or i64 [[C:%.*]], 1 +; SSE-NEXT: store i64 [[BAR5]], ptr [[FOO_2:%.*]], align 8 ; SSE-NEXT: [[FOO_4:%.*]] = load i64, ptr [[FOO_2]], align 8 -; SSE-NEXT: [[BAR5:%.*]] = load i64, ptr undef, align 8 -; SSE-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]] +; SSE-NEXT: [[FOO_3:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], ptr [[FOO_2]], i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 +; SSE-NEXT: [[FOO_5:%.*]] = load i64, ptr [[FOO_3]], align 8 +; SSE-NEXT: [[BAR6:%.*]] = load i64, ptr [[FOO_2]], align 8 ; SSE-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]] -; SSE-NEXT: store i64 [[AND_2]], ptr undef, align 8 -; SSE-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; SSE-NEXT: store i64 [[AND_1]], ptr [[BAR4]], align 8 +; SSE-NEXT: [[AND_2:%.*]] = and i64 [[BAR6]], [[FOO_5]] +; SSE-NEXT: store i64 [[AND_1]], ptr [[FOO_2]], align 8 +; SSE-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], ptr [[FOO_2]], i64 0, i32 0, i32 0, i32 0, i64 1 +; SSE-NEXT: store i64 [[AND_2]], ptr [[BAR4]], align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @_ZN1C10SwitchModeEv( ; AVX-NEXT: for.body.lr.ph.i: -; AVX-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; AVX-NEXT: store i64 [[OR_1]], ptr undef, align 8 -; AVX-NEXT: [[BAR5:%.*]] = load i64, ptr undef, align 8 -; AVX-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr undef, align 8 +; AVX-NEXT: [[OR_1:%.*]] = or i64 [[C:%.*]], 1 +; AVX-NEXT: store i64 [[OR_1]], ptr [[P:%.*]], align 8 +; AVX-NEXT: [[BAR5:%.*]] = load i64, ptr [[P]], align 8 +; AVX-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BAR5]], i32 1 ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], [[TMP0]] -; AVX-NEXT: store <2 x i64> [[TMP3]], ptr undef, align 8 +; AVX-NEXT: store <2 x i64> [[TMP3]], ptr [[P]], align 8 ; AVX-NEXT: ret void ; for.body.lr.ph.i: - %or.1 = or i64 undef, 1 - store i64 %or.1, ptr undef, align 8 - %foo.3 = load i64, ptr undef, align 8 - %foo.2 = getelementptr inbounds %class.1, ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 + %or.1 = or i64 %c, 1 + store i64 %or.1, ptr %p, align 8 + %foo.3 = load i64, ptr %p, align 8 + %foo.2 = getelementptr inbounds %class.1, ptr %p, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 %foo.4 = load i64, ptr %foo.2, align 8 - %bar5 = load i64, ptr undef, align 8 + %bar5 = load i64, ptr %p, align 8 %and.2 = and i64 %or.1, %foo.3 %and.1 = and i64 %bar5, %foo.4 - store i64 %and.2, ptr undef, align 8 - %bar4 = getelementptr inbounds %class.2, ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1 + store i64 %and.2, ptr %p, align 8 + %bar4 = getelementptr inbounds %class.2, ptr %p, i64 0, i32 0, i32 0, i32 0, i64 1 store i64 %and.1, ptr %bar4, align 8 ret void } ; Function Attrs: norecurse nounwind uwtable -define void @pr35497() local_unnamed_addr #0 { +define void @pr35497(ptr %p, i64 %c) { ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1 -; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef -; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1 -; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 undef>, i64 [[TMP0]], i32 0 -; SSE-NEXT: 
[[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2) -; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], splat (i64 20) -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0> -; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer -; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1 -; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison> -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], splat (i64 2) -; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], splat (i64 20) -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], splat (i64 6) -; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] -; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 1 +; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 +; SSE-NEXT: [[ADD:%.*]] = add i64 [[C:%.*]], [[C]] +; SSE-NEXT: store i64 [[ADD]], ptr [[P]], align 1 +; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 5 +; SSE-NEXT: [[AND_1:%.*]] = shl i64 [[C]], 2 +; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 [[C]], 6 +; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] +; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 4 +; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 [[C]], 6 +; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] +; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 1 +; SSE-NEXT: store i64 [[ADD_1]], ptr [[ARRAYIDX2_5]], align 1 +; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; SSE-NEXT: store i64 [[ADD_5]], ptr [[ARRAYIDX2_1]], align 1 +; SSE-NEXT: store i64 [[ADD_2]], ptr [[P]], align 1 +; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; SSE-NEXT: store i64 [[ADD_6]], ptr [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1 -; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef -; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1 -; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1 +; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 1 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[C:%.*]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP11]], splat (i64 6) +; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 4 +; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 1 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2) ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], splat 
(i64 20) -; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison> -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 2> +; AVX-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i32> <i32 1, i32 3> +; AVX-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP14]], [[TMP16]] +; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX-NEXT: store i64 [[TMP17]], ptr [[P]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], [[TMP13]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: store i64 [[TMP12]], ptr [[ARRAYIDX2_5]], align 1 ; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], splat (i64 2) ; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 20) +; AVX-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; AVX-NEXT: store i64 [[TMP15]], ptr [[P]], align 1 ; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], splat (i64 6) ; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] ; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; entry: - %0 = load i64, ptr undef, align 1 + %0 = load i64, ptr %p, align 1 %and = shl i64 %0, 2 %shl = and i64 %and, 20 - %add = add i64 undef, undef - store i64 %add, ptr undef, align 1 - %arrayidx2.1 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 5 - %and.1 = shl i64 undef, 2 + %add = add i64 %c, %c + store i64 %add, ptr %p, align 1 + %arrayidx2.1 = getelementptr inbounds [0 x i64], ptr %p, i64 0, i64 5 + %and.1 = shl i64 %c, 2 %shl.1 = and i64 %and.1, 20 - %shr.1 = lshr i64 undef, 6 + %shr.1 = lshr i64 %c, 6 %add.1 = add nuw nsw i64 %shl, %shr.1 - %arrayidx2.2 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 - %shr.2 = lshr i64 undef, 6 + %arrayidx2.2 = getelementptr inbounds [0 x i64], ptr %p, i64 0, i64 4 + %shr.2 = lshr i64 %c, 6 %add.2 = add nuw nsw i64 %shl.1, %shr.2 %and.4 = shl i64 %add, 2 %shl.4 = and i64 %and.4, 20 - %arrayidx2.5 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 1 + %arrayidx2.5 = getelementptr inbounds [0 x i64], ptr %p, i64 0, i64 1 store i64 %add.1, ptr %arrayidx2.5, align 1 %and.5 = shl nuw nsw i64 %add.1, 2 %shl.5 = and i64 %and.5, 20 %shr.5 = lshr i64 %add.1, 6 %add.5 = add nuw nsw i64 %shl.4, %shr.5 store i64 %add.5, ptr %arrayidx2.1, align 1 - store i64 %add.2, ptr undef, align 1 + store i64 %add.2, ptr %p, align 1 %shr.6 = lshr i64 %add.2, 6 %add.6 = add nuw nsw i64 %shl.5, %shr.6 store i64 %add.6, ptr %arrayidx2.2, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 1922e935..4527929 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,17 +10,65 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x 
ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15> -; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16 +; CHECK-NEXT: [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17 +; CHECK-NEXT: [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8 +; CHECK-NEXT: [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]] +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8 
+; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22 +; CHECK-NEXT: [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8 +; CHECK-NEXT: [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7 +; CHECK-NEXT: [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8 +; CHECK-NEXT: [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]] +; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23 +; CHECK-NEXT: [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8 +; CHECK-NEXT: [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast double [[OP_RDX]], [[OP_RDX1]] ; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 ; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index f0272d5..33c281d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,9 +6,25 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], <double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01> -; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]]) +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], 
align 8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01 +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], <double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01> +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], <double 1.600000e+01, double 1.700000e+01> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 8c9f8b5..359c24b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -1,27 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4 +; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX ; This test checks for a case when a horizontal reduction of floating-point ; adds may look profitable, but is not because it eliminates generation of ; floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today. 
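; An illustrative scalar kernel (hypothetical, not part of this test) of the
; pairing at stake: left scalar, the fmul/fadd pair below contracts into a
; single FMA on FMA-capable CPUs such as bdver2 or core-avx2; once SLP
; rewrites the fadd chain into one @llvm.vector.reduce.fadd call, that
; mul+add pairing is lost.
define double @fma_candidate(double %a, double %b, double %acc) {
  %mul = fmul fast double %a, %b
  %add = fadd fast double %mul, %acc
  ret double %add
}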
- define void @hr() { -; CHECK-LABEL: @hr( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void +; SSE4-LABEL: @hr( +; SSE4-NEXT: br label [[LOOP:%.*]] +; SSE4: loop: +; SSE4-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] +; SSE4-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; SSE4-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] +; SSE4-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; SSE4: exit: +; SSE4-NEXT: ret void +; +; AVX-LABEL: @hr( +; AVX-NEXT: br label [[LOOP:%.*]] +; AVX: loop: +; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ] +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]] +; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]] +; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; AVX: exit: +; AVX-NEXT: ret void ; br label %loop @@ -47,18 +59,27 @@ exit: ; may look profitable; but both are not because this eliminates generation ; of floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today, and if that's disabled, we -; still vectorize some of the multiplies. 
- define double @hr_or_mul() { -; CHECK-LABEL: @hr_or_mul( -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] -; CHECK-NEXT: ret double [[OP_RDX]] +; SSE4-LABEL: @hr_or_mul( +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] +; SSE4-NEXT: ret double [[OP_RDX]] +; +; AVX-LABEL: @hr_or_mul( +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]] +; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]] +; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]] +; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]] +; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]] +; AVX-NEXT: ret double [[ADD3]] ; %cvt0 = uitofp i16 3 to double %mul0 = fmul fast double 7.000000e+00, %cvt0 diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll index a64075d..5fe02cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll @@ -1,32 +1,57 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %} define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BODY:%.*]] -; CHECK: body: -; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP8]] -; 
CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]]) -; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 -; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] -; CHECK: if.then135.i: -; CHECK-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer -; CHECK-NEXT: br label [[IF_END209_I]] -; CHECK: if.end209.i: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] -; CHECK-NEXT: ret void +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: br label [[BODY:%.*]] +; X86: body: +; X86-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0 +; X86-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP1]] +; X86-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP2]]) +; X86-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[TMP3]], 0.000000e+00 +; X86-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; X86: exit: +; X86-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] +; X86: if.then135.i: +; X86-NEXT: [[TMP4:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP4]], <2 x i32> <i32 2, i32 1> +; X86-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; X86-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP6]] +; X86-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], zeroinitializer +; X86-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], zeroinitializer +; X86-NEXT: br label [[IF_END209_I]] +; X86: if.end209.i: +; X86-NEXT: [[TMP10:%.*]] = phi <2 x double> [ [[TMP9]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; X86-NEXT: ret void +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: br label [[BODY:%.*]] +; AARCH64: body: +; AARCH64-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; AARCH64-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; AARCH64-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 +; AARCH64-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 +; AARCH64-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] +; AARCH64-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 +; AARCH64-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; AARCH64: exit: +; AARCH64-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label 
[[IF_END209_I:%.*]] +; AARCH64: if.then135.i: +; AARCH64-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1> +; AARCH64-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; AARCH64-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] +; AARCH64-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer +; AARCH64-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer +; AARCH64-NEXT: br label [[IF_END209_I]] +; AARCH64: if.end209.i: +; AARCH64-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; AARCH64-NEXT: ret void ; entry: br label %body diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll index 1e4b598..b5d74f0b 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll @@ -1,24 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH86 %} define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 -; CHECK-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; CHECK-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 -; CHECK-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; CHECK-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: ret <4 x double> [[TMP7]] +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; X86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; X86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; X86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; X86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] +; X86-NEXT: [[TMP0:%.*]] = 
insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 +; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 +; X86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] +; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 +; X86-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] +; X86-NEXT: ret <4 x double> [[TMP7]] +; +; AARCH86-LABEL: @test( +; AARCH86-NEXT: entry: +; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; AARCH86-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]] +; AARCH86-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]] +; AARCH86-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00 +; AARCH86-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]] +; AARCH86-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00 +; AARCH86-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] +; AARCH86-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00 +; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]] +; AARCH86-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]] +; AARCH86-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0 +; AARCH86-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1 +; AARCH86-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2 +; AARCH86-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3 +; AARCH86-NEXT: ret <4 x double> [[I1994]] ; entry: %i1771 = getelementptr inbounds double, ptr %p2, i64 54 diff --git a/llvm/test/Transforms/Scalarizer/intrinsics.ll b/llvm/test/Transforms/Scalarizer/intrinsics.ll index cee44ef..070c765 100644 --- a/llvm/test/Transforms/Scalarizer/intrinsics.ll +++ b/llvm/test/Transforms/Scalarizer/intrinsics.ll @@ -8,6 +8,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>) declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) ; Ternary fp declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) @@ -32,6 +33,8 @@ declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float>) ; Unary fp operand, int return type declare <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float>) declare <2 x i32> @llvm.llrint.v2i32.v2f32(<2 x float>) +declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float>) +declare <2 x i32> @llvm.llround.v2i32.v2f32(<2 x float>) ; Bool return type, overloaded on fp operand type declare <2 x i1> @llvm.is.fpclass(<2 x float>, i32) @@ -159,6 +162,22 @@ define <2 x float> @scalarize_powi_v2f32(<2 x float> %x, i32 %y) #0 { ret <2 x float> %powi } +define <2 x float> 
@scalarize_ldexp_v2f32(<2 x float> %x, <2 x i32> %y) #0 { +; CHECK-LABEL: @scalarize_ldexp_v2f32( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[Y:%.*]] = extractelement <2 x i32> [[Y1:%.*]], i64 0 +; CHECK-NEXT: [[POWI_I0:%.*]] = call float @llvm.ldexp.f32.i32(float [[X_I0]], i32 [[Y]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x i32> [[Y1]], i64 1 +; CHECK-NEXT: [[POWI_I1:%.*]] = call float @llvm.ldexp.f32.i32(float [[X_I1]], i32 [[Y_I1]]) +; CHECK-NEXT: [[POWI_UPTO0:%.*]] = insertelement <2 x float> poison, float [[POWI_I0]], i64 0 +; CHECK-NEXT: [[POWI:%.*]] = insertelement <2 x float> [[POWI_UPTO0]], float [[POWI_I1]], i64 1 +; CHECK-NEXT: ret <2 x float> [[POWI]] +; + %powi = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %y) + ret <2 x float> %powi +} + define <2 x i32> @scalarize_smul_fix_sat_v2i32(<2 x i32> %x) #0 { ; CHECK-LABEL: @scalarize_smul_fix_sat_v2i32( ; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 @@ -243,6 +262,34 @@ define <2 x i32> @scalarize_llrint(<2 x float> %x) #0 { ret <2 x i32> %rnd } +define <2 x i32> @scalarize_lround(<2 x float> %x) #0 { +; CHECK-LABEL: @scalarize_lround( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[RND_I0:%.*]] = call i32 @llvm.lround.i32.f32(float [[X_I0]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[RND_I1:%.*]] = call i32 @llvm.lround.i32.f32(float [[X_I1]]) +; CHECK-NEXT: [[RND_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RND_I0]], i64 0 +; CHECK-NEXT: [[RND:%.*]] = insertelement <2 x i32> [[RND_UPTO0]], i32 [[RND_I1]], i64 1 +; CHECK-NEXT: ret <2 x i32> [[RND]] +; + %rnd = call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %x) + ret <2 x i32> %rnd +} + +define <2 x i32> @scalarize_llround(<2 x float> %x) #0 { +; CHECK-LABEL: @scalarize_llround( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[RND_I0:%.*]] = call i32 @llvm.llround.i32.f32(float [[X_I0]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[RND_I1:%.*]] = call i32 @llvm.llround.i32.f32(float [[X_I1]]) +; CHECK-NEXT: [[RND_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RND_I0]], i64 0 +; CHECK-NEXT: [[RND:%.*]] = insertelement <2 x i32> [[RND_UPTO0]], i32 [[RND_I1]], i64 1 +; CHECK-NEXT: ret <2 x i32> [[RND]] +; + %rnd = call <2 x i32> @llvm.llround.v2i32.v2f32(<2 x float> %x) + ret <2 x i32> %rnd +} + define <2 x i1> @scalarize_is_fpclass(<2 x float> %x) #0 { ; CHECK-LABEL: @scalarize_is_fpclass( ; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index d6e29a3..6dccf21 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -1749,7 +1749,7 @@ static void DumpLiteralPointerSection(MachOObjectFile *O, StringRef BytesStr = unwrapOrError(Sect->getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); switch (section_type) { case MachO::S_CSTRING_LITERALS: @@ -1965,7 +1965,7 @@ static void DumpSectionContents(StringRef Filename, MachOObjectFile *O, StringRef BytesStr = unwrapOrError(Section.getContents(), O->getFileName()); - const char *sect = reinterpret_cast<const char *>(BytesStr.data()); + 
const char *sect = BytesStr.data(); uint32_t sect_size = BytesStr.size(); uint64_t sect_addr = Section.getAddress(); @@ -2049,7 +2049,7 @@ static void DumpInfoPlistSectionContents(StringRef Filename, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(Section.getContents(), O->getFileName()); - const char *sect = reinterpret_cast<const char *>(BytesStr.data()); + const char *sect = BytesStr.data(); outs() << format("%.*s", BytesStr.size(), sect) << "\n"; return; } @@ -3237,7 +3237,7 @@ static const char *GuessCstringPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { const char *name = object_addr + object_offset; return name; @@ -3258,7 +3258,7 @@ static const char *GuessCstringPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { const char *name = object_addr + object_offset; return name; @@ -3447,7 +3447,7 @@ static uint64_t GuessPointerPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { uint64_t pointer_value; memcpy(&pointer_value, object_addr + object_offset, @@ -4350,7 +4350,7 @@ walk_pointer_list_64(const char *listname, const SectionRef S, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(S.getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); for (uint32_t i = 0; i < S.getSize(); i += sizeof(uint64_t)) { uint32_t left = S.getSize() - i; @@ -4399,7 +4399,7 @@ walk_pointer_list_32(const char *listname, const SectionRef S, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(S.getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); for (uint32_t i = 0; i < S.getSize(); i += sizeof(uint32_t)) { uint32_t left = S.getSize() - i; diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index c19c698..815759d 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -636,8 +636,14 @@ static bool isCSKYElf(const ObjectFile &Obj) { return Elf && Elf->getEMachine() == ELF::EM_CSKY; } +static bool isRISCVElf(const ObjectFile &Obj) { + const auto *Elf = dyn_cast<ELFObjectFileBase>(&Obj); + return Elf && Elf->getEMachine() == ELF::EM_RISCV; +} + static bool hasMappingSymbols(const ObjectFile &Obj) { - return isArmElf(Obj) || isAArch64Elf(Obj) || isCSKYElf(Obj); + return isArmElf(Obj) || isAArch64Elf(Obj) || isCSKYElf(Obj) || + isRISCVElf(Obj); } static void printRelocation(formatted_raw_ostream &OS, StringRef FileName, diff --git 
a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index ec9cdc1..d5f8dc4 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -619,6 +619,19 @@ TEST(StringRefTest, Hashing) { hash_value(StringRef("hello world").slice(1, -1))); } +TEST(StringRefTest, getAutoSenseRadix) { + struct RadixPair { + const char *Str; + unsigned Expected; + } RadixNumbers[] = {{"123", 10}, {"1", 10}, {"0b1", 2}, {"01", 8}, {"0o1", 8}, + {"0x1", 16}, {"0", 10}, {"00", 8}, {"", 10}}; + for (size_t i = 0; i < std::size(RadixNumbers); ++i) { + StringRef number = RadixNumbers[i].Str; + unsigned radix = getAutoSenseRadix(number); + EXPECT_EQ(radix, RadixNumbers[i].Expected); + } +} + struct UnsignedPair { const char *Str; uint64_t Expected; diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp index c24d1a5..b28c59c 100644 --- a/llvm/unittests/Support/DebugLogTest.cpp +++ b/llvm/unittests/Support/DebugLogTest.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -// This macro is defined in the LLVM build system, but we undefine it here -// so that we test at least once in-tree the case where __SHORT_FILE__ is not -// defined. -#undef __SHORT_FILE__ - #include "llvm/Support/DebugLog.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/raw_ostream.h" diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index ff4269e..14eefb5 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -388,6 +388,9 @@ function(add_mlir_library name) if(TARGET ${name}) target_link_libraries(${name} INTERFACE ${LLVM_COMMON_LIBS}) + if(ARG_INSTALL_WITH_TOOLCHAIN) + set_target_properties(${name} PROPERTIES MLIR_INSTALL_WITH_TOOLCHAIN TRUE) + endif() if(NOT ARG_DISABLE_INSTALL) add_mlir_library_install(${name}) endif() @@ -617,28 +620,29 @@ endfunction(add_mlir_aggregate) # This is usually done as part of add_mlir_library but is broken out for cases # where non-standard library builds can be installed. function(add_mlir_library_install name) - if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) - get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries) - install(TARGETS ${name} - COMPONENT ${name} - ${export_to_mlirtargets} - LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} - ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} - RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" - # Note that CMake will create a directory like: - # objects-${CMAKE_BUILD_TYPE}/obj.LibName - # and put object files there. - OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX} - ) + get_target_property(_install_with_toolchain ${name} MLIR_INSTALL_WITH_TOOLCHAIN) + if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR _install_with_toolchain) + get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries) + install(TARGETS ${name} + COMPONENT ${name} + ${export_to_mlirtargets} + LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} + ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + # Note that CMake will create a directory like: + # objects-${CMAKE_BUILD_TYPE}/obj.LibName + # and put object files there. 
+ OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX} + ) - if (NOT LLVM_ENABLE_IDE) - add_llvm_install_targets(install-${name} - DEPENDS ${name} - COMPONENT ${name}) - endif() - set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name}) + if (NOT LLVM_ENABLE_IDE) + add_llvm_install_targets(install-${name} + DEPENDS ${name} + COMPONENT ${name}) + endif() + set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name}) + set_property(GLOBAL APPEND PROPERTY MLIR_EXPORTS ${name}) endif() - set_property(GLOBAL APPEND PROPERTY MLIR_EXPORTS ${name}) endfunction() # Declare an mlir library which is part of the public C-API. diff --git a/mlir/include/mlir/Dialect/CommonFolders.h b/mlir/include/mlir/Dialect/CommonFolders.h index b5a1242..1137651 100644 --- a/mlir/include/mlir/Dialect/CommonFolders.h +++ b/mlir/include/mlir/Dialect/CommonFolders.h @@ -15,10 +15,16 @@ #ifndef MLIR_DIALECT_COMMONFOLDERS_H #define MLIR_DIALECT_COMMONFOLDERS_H +#include "mlir/IR/Attributes.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/Types.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" + +#include <cassert> +#include <cstddef> #include <optional> namespace mlir { @@ -30,11 +36,13 @@ class PoisonAttr; /// Uses `resultType` for the type of the returned attribute. /// Optional PoisonAttr template argument allows to specify 'poison' attribute /// which will be directly propagated to result. -template <class AttrElementT, +template <class AttrElementT, // class ElementValueT = typename AttrElementT::ValueType, class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, class CalculationT = function_ref< - std::optional<ElementValueT>(ElementValueT, ElementValueT)>> + std::optional<ResultElementValueT>(ElementValueT, ElementValueT)>> Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, Type resultType, CalculationT &&calculate) { @@ -65,7 +73,7 @@ Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, if (!calRes) return {}; - return AttrElementT::get(resultType, *calRes); + return ResultAttrElementT::get(resultType, *calRes); } if (isa<SplatElementsAttr>(operands[0]) && @@ -99,7 +107,7 @@ Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, return {}; auto lhsIt = *maybeLhsIt; auto rhsIt = *maybeRhsIt; - SmallVector<ElementValueT, 4> elementResults; + SmallVector<ResultElementValueT, 4> elementResults; elementResults.reserve(lhs.getNumElements()); for (size_t i = 0, e = lhs.getNumElements(); i < e; ++i, ++lhsIt, ++rhsIt) { auto elementResult = calculate(*lhsIt, *rhsIt); @@ -119,11 +127,13 @@ Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, /// attribute. /// Optional PoisonAttr template argument allows to specify 'poison' attribute /// which will be directly propagated to result. 
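///
/// Example (an illustrative sketch, assuming a hypothetical single-result op
/// `MyCmpOp` whose `getType()` is its i1 result type), shown with the
/// `resultType`-taking overload defined above: with the new
/// `ResultAttrElementT` parameter, the folded result's attribute element type
/// may differ from the operands', e.g. `FloatAttr` operands folding to an
/// `IntegerAttr` result; `void` opts out of poison handling as documented.
///
/// \code
/// OpFoldResult MyCmpOp::fold(FoldAdaptor adaptor) {
///   return constFoldBinaryOpConditional<FloatAttr, APFloat, void,
///                                       /*ResultAttrElementT=*/IntegerAttr>(
///       adaptor.getOperands(), getType(),
///       [](const APFloat &lhs, const APFloat &rhs) -> std::optional<APInt> {
///         return APInt(/*numBits=*/1, lhs.compare(rhs) == APFloat::cmpEqual);
///       });
/// }
/// \endcode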
-template <class AttrElementT, +template <class AttrElementT, // class ElementValueT = typename AttrElementT::ValueType, class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, class CalculationT = function_ref< - std::optional<ElementValueT>(ElementValueT, ElementValueT)>> + std::optional<ResultElementValueT>(ElementValueT, ElementValueT)>> Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, CalculationT &&calculate) { assert(operands.size() == 2 && "binary op takes two operands"); @@ -139,64 +149,73 @@ Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands, return operands[1]; } - auto getResultType = [](Attribute attr) -> Type { + auto getAttrType = [](Attribute attr) -> Type { if (auto typed = dyn_cast_or_null<TypedAttr>(attr)) return typed.getType(); return {}; }; - Type lhsType = getResultType(operands[0]); - Type rhsType = getResultType(operands[1]); + Type lhsType = getAttrType(operands[0]); + Type rhsType = getAttrType(operands[1]); if (!lhsType || !rhsType) return {}; if (lhsType != rhsType) return {}; return constFoldBinaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT, ResultElementValueT, CalculationT>( operands, lhsType, std::forward<CalculationT>(calculate)); } template <class AttrElementT, class ElementValueT = typename AttrElementT::ValueType, - class PoisonAttr = void, + class PoisonAttr = void, // + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, class CalculationT = - function_ref<ElementValueT(ElementValueT, ElementValueT)>> + function_ref<ResultElementValueT(ElementValueT, ElementValueT)>> Attribute constFoldBinaryOp(ArrayRef<Attribute> operands, Type resultType, CalculationT &&calculate) { - return constFoldBinaryOpConditional<AttrElementT, ElementValueT, PoisonAttr>( + return constFoldBinaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT>( operands, resultType, - [&](ElementValueT a, ElementValueT b) -> std::optional<ElementValueT> { - return calculate(a, b); - }); + [&](ElementValueT a, ElementValueT b) + -> std::optional<ResultElementValueT> { return calculate(a, b); }); } -template <class AttrElementT, +template <class AttrElementT, // class ElementValueT = typename AttrElementT::ValueType, class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, class CalculationT = - function_ref<ElementValueT(ElementValueT, ElementValueT)>> + function_ref<ResultElementValueT(ElementValueT, ElementValueT)>> Attribute constFoldBinaryOp(ArrayRef<Attribute> operands, CalculationT &&calculate) { - return constFoldBinaryOpConditional<AttrElementT, ElementValueT, PoisonAttr>( + return constFoldBinaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT>( operands, - [&](ElementValueT a, ElementValueT b) -> std::optional<ElementValueT> { - return calculate(a, b); - }); + [&](ElementValueT a, ElementValueT b) + -> std::optional<ResultElementValueT> { return calculate(a, b); }); } /// Performs constant folding `calculate` with element-wise behavior on the one /// attributes in `operands` and returns the result if possible. +/// Uses `resultType` for the type of the returned attribute. /// Optional PoisonAttr template argument allows to specify 'poison' attribute /// which will be directly propagated to result. 
-template <class AttrElementT, +template <class AttrElementT, // class ElementValueT = typename AttrElementT::ValueType, class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, class CalculationT = - function_ref<std::optional<ElementValueT>(ElementValueT)>> + function_ref<std::optional<ResultElementValueT>(ElementValueT)>> Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, + Type resultType, CalculationT &&calculate) { - if (!llvm::getSingleElement(operands)) + if (!resultType || !llvm::getSingleElement(operands)) return {}; static_assert( @@ -214,7 +233,7 @@ Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, auto res = calculate(op.getValue()); if (!res) return {}; - return AttrElementT::get(op.getType(), *res); + return ResultAttrElementT::get(resultType, *res); } if (isa<SplatElementsAttr>(operands[0])) { // Both operands are splats so we can avoid expanding the values out and @@ -224,7 +243,7 @@ Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, auto elementResult = calculate(op.getSplatValue<ElementValueT>()); if (!elementResult) return {}; - return DenseElementsAttr::get(op.getType(), *elementResult); + return DenseElementsAttr::get(cast<ShapedType>(resultType), *elementResult); } else if (isa<ElementsAttr>(operands[0])) { // Operands are ElementsAttr-derived; perform an element-wise fold by // expanding the values. @@ -234,7 +253,7 @@ Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, if (!maybeOpIt) return {}; auto opIt = *maybeOpIt; - SmallVector<ElementValueT> elementResults; + SmallVector<ResultElementValueT> elementResults; elementResults.reserve(op.getNumElements()); for (size_t i = 0, e = op.getNumElements(); i < e; ++i, ++opIt) { auto elementResult = calculate(*opIt); @@ -242,19 +261,81 @@ Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, return {}; elementResults.push_back(*elementResult); } - return DenseElementsAttr::get(op.getShapedType(), elementResults); + return DenseElementsAttr::get(cast<ShapedType>(resultType), elementResults); } return {}; } -template <class AttrElementT, +/// Performs constant folding `calculate` with element-wise behavior on the one +/// attributes in `operands` and returns the result if possible. +/// Uses the operand element type for the element type of the returned +/// attribute. +/// Optional PoisonAttr template argument allows to specify 'poison' attribute +/// which will be directly propagated to result. 
+template <class AttrElementT, // + class ElementValueT = typename AttrElementT::ValueType, + class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, + class CalculationT = + function_ref<std::optional<ResultElementValueT>(ElementValueT)>> +Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands, + CalculationT &&calculate) { + if (!llvm::getSingleElement(operands)) + return {}; + + static_assert( + std::is_void_v<PoisonAttr> || !llvm::is_incomplete_v<PoisonAttr>, + "PoisonAttr is undefined, either add a dependency on UB dialect or pass " + "void as template argument to opt-out from poison semantics."); + if constexpr (!std::is_void_v<PoisonAttr>) { + if (isa<PoisonAttr>(operands[0])) + return operands[0]; + } + + auto getAttrType = [](Attribute attr) -> Type { + if (auto typed = dyn_cast_or_null<TypedAttr>(attr)) + return typed.getType(); + return {}; + }; + + Type operandType = getAttrType(operands[0]); + if (!operandType) + return {}; + + return constFoldUnaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT, ResultElementValueT, + CalculationT>( + operands, operandType, std::forward<CalculationT>(calculate)); +} + +template <class AttrElementT, // + class ElementValueT = typename AttrElementT::ValueType, + class PoisonAttr = ub::PoisonAttr, + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, + class CalculationT = function_ref<ResultElementValueT(ElementValueT)>> +Attribute constFoldUnaryOp(ArrayRef<Attribute> operands, Type resultType, + CalculationT &&calculate) { + return constFoldUnaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT>( + operands, resultType, + [&](ElementValueT a) -> std::optional<ResultElementValueT> { + return calculate(a); + }); +} + +template <class AttrElementT, // class ElementValueT = typename AttrElementT::ValueType, class PoisonAttr = ub::PoisonAttr, - class CalculationT = function_ref<ElementValueT(ElementValueT)>> + class ResultAttrElementT = AttrElementT, + class ResultElementValueT = typename ResultAttrElementT::ValueType, + class CalculationT = function_ref<ResultElementValueT(ElementValueT)>> Attribute constFoldUnaryOp(ArrayRef<Attribute> operands, CalculationT &&calculate) { - return constFoldUnaryOpConditional<AttrElementT, ElementValueT, PoisonAttr>( - operands, [&](ElementValueT a) -> std::optional<ElementValueT> { + return constFoldUnaryOpConditional<AttrElementT, ElementValueT, PoisonAttr, + ResultAttrElementT>( + operands, [&](ElementValueT a) -> std::optional<ResultElementValueT> { return calculate(a); }); } diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index a2354e2..90da243 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -189,6 +189,20 @@ def ROCDL_BallotOp : let assemblyFormat = "$pred attr-dict `:` type($res)"; } +def ROCDL_ReadfirstlaneOp : ROCDL_IntrOp<"readfirstlane", [], [0], [AllTypesMatch<["res", "src"]>], 1>, + Arguments<(ins LLVM_Type:$src)> { + let results = (outs LLVM_Type:$res); + let summary = "Get the value in first active lane."; + + let description = [{ + Returns the value in the lowest active lane of the input operand. 
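+
+    Example (illustrative; `%src` may be any type the op accepts, f32 here,
+    following the assembly format below):
+
+    ```mlir
+    %0 = rocdl.readfirstlane %src : f32
+    ```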
+ }]; + + let assemblyFormat = [{ + $src attr-dict `:` type($res) + }]; +} + def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res", "src0"]>], 1>, Arguments<(ins LLVM_Type:$src0, I32:$src1)> { @@ -201,7 +215,7 @@ def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res", let assemblyFormat = [{ $src0 `,` $src1 attr-dict `:` `(` type($src0) `,` type($src1) `)` `->` type($res) - }]; + }]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index ba73cfb..9f1e88a0 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -474,7 +474,7 @@ def LinalgStructuredInterface int64_t resultIndex = opOperand->getOperandNumber() - $_op.getNumDpsInputs(); assert(resultIndex >= 0 && - resultIndex < this->getOperation()->getNumResults()); + resultIndex < $_op.getNumDpsInits()); Operation *yieldOp = getBlock()->getTerminator(); return &yieldOp->getOpOperand(resultIndex); }] diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td index 373842c..f236629 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -89,6 +89,45 @@ def LinalgInlineScalarOperandsPass : Pass<"linalg-inline-scalar-operands"> { ]; } +def LinalgMorphOpsPass : Pass<"linalg-morph-ops"> { + let summary = "Convert named ops to category or generic ops and vice-versa"; + + let description = [{ + Convert a linalg op from one representation to an equivalent one. + For example, a linalg named op `linalg.add` can also be written as a + category op `linalg.elementwise`, and can also be rewritten as + a `linalg.generic`, giving the morphism: + + named-op <--> category_op (elementwise, contraction, ..) <--> generic + + Note that the set of `linalg.generic` subsumes named and category ops + and therefore not all `linalg.generic` can be converted to a named or + category op. Similarly, category ops subsume named ops. + + Note: + Legacy converters: + `--linalg-generalize-named-ops` is the path `named-op --> generic-op` + `--linalg-specialize-generic-ops` is the path `named-op <-- generic-op` + }]; + let dependentDialects = ["linalg::LinalgDialect"]; + + let options = [ + // named-op <--> category <--> generic + + // Lowering options + Option<"namedToCategory", "named-to-category", "bool", /*default=*/"false", + "convert named ops to category op e.g. `linalg.elementwise`">, + Option<"categoryToGeneric", "category-to-generic", "bool", /*default=*/"false", + "convert category ops e.g. `linalg.elementwise` to `linalg.generic`">, + Option<"namedToGeneric", "named-to-generic", "bool", /*default=*/"false", + "convert named ops e.g.
`linalg.add` to `linalg.generic`">, + + // Lifting options + // TODOs: `generic-to-category`, `category-to-named` + Option<"genericToNamed", "generic-to-named", "bool", /*default=*/"false", + "convert linalg.generic to equivalent named ops"> ]; +} + def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops"> { let summary = "Convert named ops into generic ops"; let dependentDialects = ["linalg::LinalgDialect"]; diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index d4ffe0a..1e5b5d4 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1831,6 +1831,10 @@ void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns); void populateLinalgGenericOpsSpecializationPatterns( RewritePatternSet &patterns); +/// Populates `patterns` that convert linalg named ops e.g. `linalg.add` +/// to equivalent `linalg.elementwise`. +void populateLinalgNamedToElementwisePatterns(RewritePatternSet &patterns); + /// Populates `patterns` with patterns that fold operations like /// `linalg.transform` into elementwise op map. void populateLinalgFoldIntoElementwisePatterns(RewritePatternSet &patterns); diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index 364c172..63410b8 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -32,6 +32,7 @@ #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Alignment.h" // Pull in all enum type definitions and utility function declarations. #include "mlir/Dialect/Vector/IR/VectorEnums.h.inc" diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index dc55704..eeedf68 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1729,18 +1729,18 @@ def Vector_LoadOp : Vector_Op<"load", [ "Value":$base, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::Align", "llvm::Align()">:$alignment), [{ return build($_builder, $_state, resultType, base, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + alignment != llvm::Align() ? $_builder.getI64IntegerAttr(alignment.value()) : nullptr); }]>, OpBuilder<(ins "TypeRange":$resultTypes, "Value":$base, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::Align", "llvm::Align()">:$alignment), [{ return build($_builder, $_state, resultTypes, base, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + alignment != llvm::Align() ? $_builder.getI64IntegerAttr(alignment.value()) : nullptr); }]> ]; @@ -1847,9 +1847,9 @@ def Vector_StoreOp : Vector_Op<"store", [ "Value":$base, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::Align", "llvm::Align()">:$alignment), [{ return build($_builder, $_state, valueToStore, base, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + alignment != llvm::Align() ? 
$_builder.getI64IntegerAttr(alignment.value()) : nullptr); }]> ]; diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp index c7ecd83..2e00b42 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp @@ -22,6 +22,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Visitors.h" #include <cassert> +#include <limits> #include <optional> #define DEBUG_TYPE "memref-to-spirv-pattern" @@ -475,7 +476,12 @@ struct MemoryRequirements { /// Given an accessed SPIR-V pointer, calculates its alignment requirements, if /// any. static FailureOr<MemoryRequirements> -calculateMemoryRequirements(Value accessedPtr, bool isNontemporal) { +calculateMemoryRequirements(Value accessedPtr, bool isNontemporal, + uint64_t preferredAlignment) { + if (preferredAlignment >= std::numeric_limits<uint32_t>::max()) { + return failure(); + } + MLIRContext *ctx = accessedPtr.getContext(); auto memoryAccess = spirv::MemoryAccess::None; @@ -484,7 +490,10 @@ calculateMemoryRequirements(Value accessedPtr, bool isNontemporal) { } auto ptrType = cast<spirv::PointerType>(accessedPtr.getType()); - if (ptrType.getStorageClass() != spirv::StorageClass::PhysicalStorageBuffer) { + bool mayOmitAlignment = + !preferredAlignment && + ptrType.getStorageClass() != spirv::StorageClass::PhysicalStorageBuffer; + if (mayOmitAlignment) { if (memoryAccess == spirv::MemoryAccess::None) { return MemoryRequirements{spirv::MemoryAccessAttr{}, IntegerAttr{}}; } @@ -493,6 +502,7 @@ calculateMemoryRequirements(Value accessedPtr, bool isNontemporal) { } // PhysicalStorageBuffers require the `Aligned` attribute. + // Other storage types may show an `Aligned` attribute. auto pointeeType = dyn_cast<spirv::ScalarType>(ptrType.getPointeeType()); if (!pointeeType) return failure(); @@ -504,7 +514,8 @@ calculateMemoryRequirements(Value accessedPtr, bool isNontemporal) { memoryAccess = memoryAccess | spirv::MemoryAccess::Aligned; auto memAccessAttr = spirv::MemoryAccessAttr::get(ctx, memoryAccess); - auto alignment = IntegerAttr::get(IntegerType::get(ctx, 32), *sizeInBytes); + auto alignmentValue = preferredAlignment ? 
preferredAlignment : *sizeInBytes; + auto alignment = IntegerAttr::get(IntegerType::get(ctx, 32), alignmentValue); return MemoryRequirements{memAccessAttr, alignment}; } @@ -518,16 +529,9 @@ calculateMemoryRequirements(Value accessedPtr, LoadOrStoreOp loadOrStoreOp) { llvm::is_one_of<LoadOrStoreOp, memref::LoadOp, memref::StoreOp>::value, "Must be called on either memref::LoadOp or memref::StoreOp"); - Operation *memrefAccessOp = loadOrStoreOp.getOperation(); - auto memrefMemAccess = memrefAccessOp->getAttrOfType<spirv::MemoryAccessAttr>( - spirv::attributeName<spirv::MemoryAccess>()); - auto memrefAlignment = - memrefAccessOp->getAttrOfType<IntegerAttr>("alignment"); - if (memrefMemAccess && memrefAlignment) - return MemoryRequirements{memrefMemAccess, memrefAlignment}; - return calculateMemoryRequirements(accessedPtr, - loadOrStoreOp.getNontemporal()); + loadOrStoreOp.getNontemporal(), + loadOrStoreOp.getAlignment().value_or(0)); } LogicalResult diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index e6a3154..e79da92 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -114,11 +114,8 @@ bool mlir::emitc::isIntegerIndexOrOpaqueType(Type type) { bool mlir::emitc::isSupportedFloatType(Type type) { if (auto floatType = llvm::dyn_cast<FloatType>(type)) { switch (floatType.getWidth()) { - case 16: { - if (llvm::isa<Float16Type, BFloat16Type>(type)) - return true; - return false; - } + case 16: + return llvm::isa<Float16Type, BFloat16Type>(type); case 32: case 64: return true; diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 70f846e..6ec2e9fd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -23,9 +23,11 @@ add_mlir_dialect_library(MLIRLinalgTransforms InlineScalarOperands.cpp Interchange.cpp Loops.cpp + MorphOps.cpp TransposeMatmul.cpp ShardingInterfaceImpl.cpp NamedOpConversions.cpp + NamedToElementwise.cpp BlockPackMatmul.cpp PackAndUnpackPatterns.cpp Padding.cpp diff --git a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp new file mode 100644 index 0000000..f261ccb --- /dev/null +++ b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp @@ -0,0 +1,62 @@ +//===- MorphOps.cpp - conversion between named,category and generic ops ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements conversions between linalg ops: +// named <--> category (elementwise, contraction, ..) <--> generic. 
+//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +#define GEN_PASS_DEF_LINALGMORPHOPSPASS +#include "mlir/Dialect/Linalg/Passes.h.inc" +} // namespace mlir + +#define DEBUG_TYPE "linalg-morphism" + +using namespace mlir; +using namespace mlir::linalg; + +namespace { +struct LinalgMorphOpsPass + : public impl::LinalgMorphOpsPassBase<LinalgMorphOpsPass> { + + using impl::LinalgMorphOpsPassBase< + LinalgMorphOpsPass>::LinalgMorphOpsPassBase; + + void runOnOperation() override; +}; + +void LinalgMorphOpsPass::runOnOperation() { + + RewritePatternSet patterns(&getContext()); + + // Lowering paths (named -> category -> generic) + if (namedToCategory) { + populateLinalgNamedToElementwisePatterns(patterns); + } + if (namedToGeneric || categoryToGeneric) { + populateLinalgNamedOpsGeneralizationPatterns(patterns); + } + + // Lifting paths (named <- category <- generic) + if (genericToNamed) { + populateLinalgGenericOpsSpecializationPatterns(patterns); + } + + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) + signalPassFailure(); +} +} // namespace diff --git a/mlir/lib/Dialect/Linalg/Transforms/NamedToElementwise.cpp b/mlir/lib/Dialect/Linalg/Transforms/NamedToElementwise.cpp new file mode 100644 index 0000000..00a076b --- /dev/null +++ b/mlir/lib/Dialect/Linalg/Transforms/NamedToElementwise.cpp @@ -0,0 +1,104 @@ +//===- NamedToElementwise.cpp - convert linalg named op into elementwise --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements rewriting linalg named ops that are essentially +// elementwise, e.g. `linalg.exp`, into `linalg.elementwise`, enabling further +// optimizations on `linalg.elementwise` such as folding transpose and broadcast. 
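+// +// For example (an illustrative sketch; see the tests accompanying this patch): +//   %e = linalg.exp ins(%x : tensor<8xf32>) outs(%y : tensor<8xf32>) -> tensor<8xf32> +// becomes: +//   %e = linalg.elementwise kind=#linalg.elementwise_kind<exp> +//        ins(%x : tensor<8xf32>) outs(%y : tensor<8xf32>) -> tensor<8xf32>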
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TypeSwitch.h" + +using namespace mlir; +using namespace mlir::linalg; + +#define DEBUG_TYPE "linalg-named-to-elementwise" + +namespace { +ElementwiseKind getKind(Operation *op) { + return llvm::TypeSwitch<Operation *, ElementwiseKind>(op) + .Case([](SelectOp) { return ElementwiseKind::select; }) + .Case([](AddOp) { return ElementwiseKind::add; }) + .Case([](SubOp) { return ElementwiseKind::sub; }) + .Case([](MulOp) { return ElementwiseKind::mul; }) + .Case([](DivOp) { return ElementwiseKind::div; }) + .Case([](DivUnsignedOp) { return ElementwiseKind::div_unsigned; }) + .Case([](PowFOp) { return ElementwiseKind::powf; }) + .Case([](ExpOp) { return ElementwiseKind::exp; }) + .Case([](LogOp) { return ElementwiseKind::log; }) + .Case([](AbsOp) { return ElementwiseKind::abs; }) + .Case([](CeilOp) { return ElementwiseKind::ceil; }) + .Case([](FloorOp) { return ElementwiseKind::floor; }) + .Case([](NegFOp) { return ElementwiseKind::negf; }) + .Case([](ReciprocalOp) { return ElementwiseKind::reciprocal; }) + .Case([](RoundOp) { return ElementwiseKind::round; }) + .Case([](SqrtOp) { return ElementwiseKind::sqrt; }) + .Case([](RsqrtOp) { return ElementwiseKind::rsqrt; }) + .Case([](SquareOp) { return ElementwiseKind::square; }) + .Case([](TanhOp) { return ElementwiseKind::tanh; }) + .Case([](ErfOp) { return ElementwiseKind::erf; }) + .Default([&](Operation *op) { + llvm_unreachable("unhandled case in named to elementwise"); + return ElementwiseKind::sub; + }); +} + +template <typename NamedOpTy> +struct NamedToElementwisePattern : public OpRewritePattern<NamedOpTy> { + using OpRewritePattern<NamedOpTy>::OpRewritePattern; + + LogicalResult matchAndRewrite(NamedOpTy op, + PatternRewriter &rewriter) const override { + SmallVector<NamedAttribute> attrs; + auto kindAttr = ElementwiseKindAttr::get(op.getContext(), getKind(op)); + attrs.push_back(rewriter.getNamedAttr("kind", kindAttr)); + attrs.push_back( + rewriter.getNamedAttr("indexing_maps", op.getIndexingMaps())); + + rewriter.replaceOpWithNewOp<ElementwiseOp>(op, op.getDpsInputs(), + op.getDpsInits(), attrs); + return success(); + } +}; +} // namespace + +void mlir::linalg::populateLinalgNamedToElementwisePatterns( + RewritePatternSet &patterns) { + patterns.add<NamedToElementwisePattern<SelectOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<AddOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<SubOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<MulOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<DivOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<DivUnsignedOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<PowFOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<ExpOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<LogOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<AbsOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<CeilOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<FloorOp>>(patterns.getContext()); + 
patterns.add<NamedToElementwisePattern<NegFOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<ReciprocalOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<RoundOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<SqrtOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<RsqrtOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<SquareOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<TanhOp>>(patterns.getContext()); + patterns.add<NamedToElementwisePattern<ErfOp>>(patterns.getContext()); +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 762cc88..2cdd502 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3205,6 +3205,23 @@ llvm::AtomicRMWInst::BinOp convertBinOpToAtomic(Operation &op) { .Default(llvm::AtomicRMWInst::BinOp::BAD_BINOP); } +void extractAtomicControlFlags(omp::AtomicUpdateOp atomicUpdateOp, + bool &isIgnoreDenormalMode, + bool &isFineGrainedMemory, + bool &isRemoteMemory) { + isIgnoreDenormalMode = false; + isFineGrainedMemory = false; + isRemoteMemory = false; + if (atomicUpdateOp && + atomicUpdateOp->hasAttr(atomicUpdateOp.getAtomicControlAttrName())) { + mlir::omp::AtomicControlAttr atomicControlAttr = + atomicUpdateOp.getAtomicControlAttr(); + isIgnoreDenormalMode = atomicControlAttr.getIgnoreDenormalMode(); + isFineGrainedMemory = atomicControlAttr.getFineGrainedMemory(); + isRemoteMemory = atomicControlAttr.getRemoteMemory(); + } +} + /// Converts an OpenMP atomic update operation using OpenMPIRBuilder. static LogicalResult convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, @@ -3269,13 +3286,19 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, return moduleTranslation.lookupValue(yieldop.getResults()[0]); }; + bool isIgnoreDenormalMode; + bool isFineGrainedMemory; + bool isRemoteMemory; + extractAtomicControlFlags(opInst, isIgnoreDenormalMode, isFineGrainedMemory, + isRemoteMemory); // Handle ambiguous alloca, if any. auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicUpdate(ompLoc, allocaIP, llvmAtomicX, llvmExpr, atomicOrdering, binop, updateFn, - isXBinopExpr); + isXBinopExpr, isIgnoreDenormalMode, + isFineGrainedMemory, isRemoteMemory); if (failed(handleError(afterIP, *opInst))) return failure(); @@ -3364,13 +3387,19 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, return moduleTranslation.lookupValue(yieldop.getResults()[0]); }; + bool isIgnoreDenormalMode; + bool isFineGrainedMemory; + bool isRemoteMemory; + extractAtomicControlFlags(atomicUpdateOp, isIgnoreDenormalMode, + isFineGrainedMemory, isRemoteMemory); // Handle ambiguous alloca, if any. 
auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicCapture( ompLoc, allocaIP, llvmAtomicX, llvmAtomicV, llvmExpr, atomicOrdering, - binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr); + binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr, + isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory); if (failed(handleError(afterIP, *atomicCaptureOp))) return failure(); diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir index 2a7be0b..e6321e9 100644 --- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir +++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir @@ -85,6 +85,28 @@ func.func @load_i1(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : return %0: i1 } +// CHECK-LABEL: func @load_aligned +// CHECK-SAME: (%[[SRC:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %[[IDX:.+]]: index) +func.func @load_aligned(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 { + // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<StorageBuffer>> + return %0: i1 +} + +// CHECK-LABEL: func @load_aligned_nontemporal +func.func @load_aligned_nontemporal(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 { + // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned|Nontemporal", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32, nontemporal = true } : memref<4xi1, #spirv.storage_class<StorageBuffer>> + return %0: i1 +} + +// CHECK-LABEL: func @load_aligned_psb +func.func @load_aligned_psb(%src: memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>>, %i : index) -> i1 { + // CHECK: %[[VAL:.+]] = spirv.Load "PhysicalStorageBuffer" {{.*}} ["Aligned", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>> + return %0: i1 +} + // CHECK-LABEL: func @store_i1 // CHECK-SAME: %[[DST:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>, // CHECK-SAME: %[[IDX:.+]]: index diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 3d5a46d..78f6782 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -654,7 +654,7 @@ func.func @signExtendConstant() -> i16 { // CHECK: return %[[cres]] func.func @signExtendConstantSplat() -> vector<4xi16> { %c-2 = arith.constant -2 : i8 - %splat = vector.splat %c-2 : vector<4xi8> + %splat = vector.broadcast %c-2 : i8 to vector<4xi8> %ext = arith.extsi %splat : vector<4xi8> to vector<4xi16> return %ext : vector<4xi16> } @@ -682,7 +682,7 @@ func.func @unsignedExtendConstant() -> i16 { // CHECK: return %[[cres]] func.func @unsignedExtendConstantSplat() -> vector<4xi16> { %c2 = arith.constant 2 : i8 - %splat = vector.splat %c2 : vector<4xi8> + %splat = vector.broadcast %c2 : i8 to vector<4xi8> %ext = arith.extui %splat : vector<4xi8> to vector<4xi16> return %ext : vector<4xi16> } @@ -866,7 +866,7 @@ func.func @truncExtsiVector(%arg0: vector<2xi32>) -> vector<2xi16> { // CHECK: return %[[cres]] func.func @truncConstantSplat() -> vector<4xi8> { %c-2 = arith.constant -2 : i16 - %splat = vector.splat %c-2 : vector<4xi16> + %splat = vector.broadcast %c-2 : i16 to vector<4xi16> %trunc = arith.trunci %splat : 
vector<4xi16> to vector<4xi8> return %trunc : vector<4xi8> } @@ -2334,7 +2334,7 @@ func.func @constant_FPtoUI_splat() -> vector<4xi32> { // CHECK: %[[C0:.+]] = arith.constant dense<2> : vector<4xi32> // CHECK: return %[[C0]] %c0 = arith.constant 2.0 : f32 - %splat = vector.splat %c0 : vector<4xf32> + %splat = vector.broadcast %c0 : f32 to vector<4xf32> %res = arith.fptoui %splat : vector<4xf32> to vector<4xi32> return %res : vector<4xi32> } @@ -2374,7 +2374,7 @@ func.func @constant_FPtoSI_splat() -> vector<4xi32> { // CHECK: %[[C0:.+]] = arith.constant dense<-2> : vector<4xi32> // CHECK: return %[[C0]] %c0 = arith.constant -2.0 : f32 - %splat = vector.splat %c0 : vector<4xf32> + %splat = vector.broadcast %c0 : f32 to vector<4xf32> %res = arith.fptosi %splat : vector<4xf32> to vector<4xi32> return %res : vector<4xi32> } @@ -2413,7 +2413,7 @@ func.func @constant_SItoFP_splat() -> vector<4xf32> { // CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32> // CHECK: return %[[C0]] %c0 = arith.constant 2 : i32 - %splat = vector.splat %c0 : vector<4xi32> + %splat = vector.broadcast %c0 : i32 to vector<4xi32> %res = arith.sitofp %splat : vector<4xi32> to vector<4xf32> return %res : vector<4xf32> } @@ -2442,7 +2442,7 @@ func.func @constant_UItoFP_splat() -> vector<4xf32> { // CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32> // CHECK: return %[[C0]] %c0 = arith.constant 2 : i32 - %splat = vector.splat %c0 : vector<4xi32> + %splat = vector.broadcast %c0 : i32 to vector<4xi32> %res = arith.uitofp %splat : vector<4xi32> to vector<4xf32> return %res : vector<4xf32> } diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index a2b2f84..db5271c 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -981,6 +981,13 @@ llvm.func @rocdl.s.wait.expcnt() { // ----- +llvm.func @rocdl.readfirstlane(%src : f32) -> f32 { + // CHECK-LABEL: rocdl.readfirstlane + // CHECK: rocdl.readfirstlane %{{.*}} : f32 + %ret = rocdl.readfirstlane %src : f32 + llvm.return %ret : f32 +} + llvm.func @rocdl.readlane(%src : f32) -> f32 { %cst0 = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir new file mode 100644 index 0000000..2332b28 --- /dev/null +++ b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir @@ -0,0 +1,56 @@ +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category -split-input-file | FileCheck %s + +// CHECK: @exp(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<exp> +// CHECK-SAME: ins(%[[A]] : tensor<16x8xf32>) +// CHECK-SAME: outs(%[[B]] : tensor<16x8xf32>) -> tensor<16x8xf32> +// +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} + +// ----- + +// CHECK: @add(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<add> +// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>) +// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>) -> tensor<16x8xf32> +// +func.func @add(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> { 
+ %add = linalg.add ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32> + return %add : tensor<16x8xf32> +} + +// ----- + +// CHECK: @sub(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<sub> +// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>) +// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>) +// +func.func @sub(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> { + %sub = linalg.sub ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32> + return %sub : tensor<16x8xf32> +} + +// ----- + +// CHECK: @ternary_select(%[[A:.+]]: tensor<4x8x16xi1>, %[[B:.+]]: tensor<4x8x16xf32>, %[[C:.+]]: tensor<4x8x16xf32>) +// CHECK: %[[E:.+]] = tensor.empty() : tensor<4x8x16xf32> +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<select> +// CHECK-SAME: ins(%[[A]], %[[B]], %[[C]] : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) +// CHECK-SAME: outs(%[[E]] : tensor<4x8x16xf32>) -> tensor<4x8x16xf32> +// +func.func @ternary_select(%A: tensor<4x8x16xi1>, %B: tensor<4x8x16xf32>, %C: tensor<4x8x16xf32>) + -> tensor<4x8x16xf32> { + %empty = tensor.empty() : tensor<4x8x16xf32> + %select = linalg.select + ins(%A, %B, %C : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) + outs(%empty: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> + return %select : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir new file mode 100644 index 0000000..00602c4 --- /dev/null +++ b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir @@ -0,0 +1,15 @@ +// Forward path `named -> category -> generic` +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | FileCheck %s --check-prefix=NAMED_TO_CATEGORY + +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | \ +// RUN: mlir-opt - -linalg-morph-ops=category-to-generic | FileCheck %s --check-prefix=CATEGORY_TO_GENERIC + +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} +// NAMED_TO_CATEGORY: linalg.elementwise +// NAMED_TO_CATEGORY-NOT: linalg.exp + +// CATEGORY_TO_GENERIC: linalg.generic +// CATEGORY_TO_GENERIC-NOT: linalg.elementwise diff --git a/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir new file mode 100644 index 0000000..ab50a44 --- /dev/null +++ b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir @@ -0,0 +1,14 @@ +// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | FileCheck %s --check-prefix=NAMED_TO_GENERIC +// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | mlir-opt - -linalg-morph-ops=generic-to-named | \ +// RUN: FileCheck %s --check-prefix=ROUND_TRIP + +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} + +// NAMED_TO_GENERIC: linalg.generic +// NAMED_TO_GENERIC-NOT: linalg.exp + +// ROUND_TRIP: linalg.exp +// ROUND_TRIP-NOT: linalg.generic diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir 
index ef881ba..577b06d 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -40,7 +40,7 @@ func.func @broadcast_scalar_with_bcast_scalable(%arg1: index, %arg2: index) -> v // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x4xindex> // CHECK: return %[[BCAST]] : vector<1x4xindex> func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> vector<1x4xindex> { - %0 = vector.splat %arg1 : vector<1x4xindex> + %0 = vector.broadcast %arg1 : index to vector<1x4xindex> %1 = vector.broadcast %arg2 : index to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> @@ -53,7 +53,7 @@ func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x[4]xindex> // CHECK: return %[[BCAST]] : vector<1x[4]xindex> func.func @broadcast_scalar_with_bcast_and_splat_scalable(%arg1: index, %arg2: index) -> vector<1x[4]xindex> { - %0 = vector.splat %arg1 : vector<1x[4]xindex> + %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex> %1 = vector.broadcast %arg2 : index to vector<1x[4]xindex> %2 = arith.addi %0, %1 : vector<1x[4]xindex> return %2 : vector<1x[4]xindex> @@ -94,12 +94,12 @@ func.func @broadcast_vector_scalable(%arg1: vector<[4]xf32>, %arg2: vector<[4]xf // CHECK-LABEL: func.func @broadcast_scalar_and_vec( // CHECK-SAME: %[[ARG1:.*]]: index, // CHECK-SAME: %[[ARG2:.*]]: vector<4xindex>) -> vector<1x4xindex> { -// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x4xindex> +// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x4xindex> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<4xindex> to vector<1x4xindex> // CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x4xindex> // CHECK: return %[[ADD]] : vector<1x4xindex> func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vector<1x4xindex> { - %0 = vector.splat %arg1 : vector<1x4xindex> + %0 = vector.broadcast %arg1 : index to vector<1x4xindex> %1 = vector.broadcast %arg2 : vector<4xindex> to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> @@ -108,12 +108,12 @@ func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vec // CHECK-LABEL: func.func @broadcast_scalar_and_vec_scalable( // CHECK-SAME: %[[ARG1:.*]]: index, // CHECK-SAME: %[[ARG2:.*]]: vector<[4]xindex>) -> vector<1x[4]xindex> { -// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x[4]xindex> +// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x[4]xindex> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<[4]xindex> to vector<1x[4]xindex> // CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x[4]xindex> // CHECK: return %[[ADD]] : vector<1x[4]xindex> func.func @broadcast_scalar_and_vec_scalable(%arg1: index, %arg2: vector<[4]xindex>) -> vector<1x[4]xindex> { - %0 = vector.splat %arg1 : vector<1x[4]xindex> + %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex> %1 = vector.broadcast %arg2 : vector<[4]xindex> to vector<1x[4]xindex> %2 = arith.addi %0, %1 : vector<1x[4]xindex> return %2 : vector<1x[4]xindex> @@ -787,7 +787,7 @@ func.func @negative_extract_load_scalable(%arg0: memref<?xf32>, %arg1: index) -> // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @store_splat(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { // CHECK: memref.store 
%[[ARG2]], %[[ARG0]][%[[ARG1]]] : memref<?xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32> return } @@ -813,9 +813,9 @@ func.func @store_broadcast_1d_to_2d(%arg0: memref<?x?xf32>, %arg1: index, %arg2: // CHECK-LABEL: @negative_store_scalable // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<[1]xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<[1]xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<[1]xf32> - %0 = vector.splat %arg2 : vector<[1]xf32> + %0 = vector.broadcast %arg2 : f32 to vector<[1]xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<[1]xf32> return } @@ -823,9 +823,9 @@ func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f3 // CHECK-LABEL: @negative_store_memref_of_vec // CHECK-SAME: (%[[ARG0:.*]]: memref<?xvector<1xf32>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xvector<1xf32>>, vector<1xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xvector<1xf32>>, vector<1xf32> return } @@ -833,9 +833,9 @@ func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: i // CHECK-LABEL: @negative_store_more_than_one_element // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<4xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<4xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<4xf32> - %0 = vector.splat %arg2 : vector<4xf32> + %0 = vector.broadcast %arg2 : f32 to vector<4xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<4xf32> return } @@ -843,10 +843,10 @@ func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: ind // CHECK-LABEL: @negative_store_no_single_use // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_no_single_use(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> vector<1xf32> { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<1xf32> // CHECK: return %[[RES:.*]] : vector<1xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32> return %0 : vector<1xf32> } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir index 1b54d54..45afbff 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -285,19 +285,19 @@ func.func 
@transfer_read_permutations(%mem_0 : memref<?x?xf32>, %mem_1 : memref< %c0 = arith.constant 0 : index // CHECK: %[[MASK0:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1> - %mask0 = vector.splat %m : vector<14x7xi1> + %mask0 = vector.broadcast %m : i1 to vector<14x7xi1> %0 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask0 {in_bounds = [true, false, true, true], permutation_map = #map0} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [false, true, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32> // CHECK: vector.transpose %{{.*}}, [1, 0, 2, 3] : vector<14x7x8x16xf32> to vector<7x14x8x16xf32> // CHECK: %[[MASK1:.*]] = vector.broadcast %{{.*}} : i1 to vector<16x14xi1> - %mask1 = vector.splat %m : vector<16x14xi1> + %mask1 = vector.broadcast %m : i1 to vector<16x14xi1> %1 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask1 {in_bounds = [true, false, true, false], permutation_map = #map1} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32> // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32> // CHECK: %[[MASK3:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1> - %mask2 = vector.splat %m : vector<14x7xi1> + %mask2 = vector.broadcast %m : i1 to vector<14x7xi1> %2 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask2 {in_bounds = [true, false, true, true], permutation_map = #map2} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32> // CHECK: vector.broadcast %{{.*}} : vector<14x16x7xf32> to vector<8x14x16x7xf32> @@ -337,7 +337,7 @@ func.func @transfer_write_permutations_tensor_masked( %c0 = arith.constant 0 : index // CHECK: %[[MASK:.*]] = vector.broadcast %[[M]] : i1 to vector<16x14x7x8xi1> - %mask0 = vector.splat %m : vector<16x14x7x8xi1> + %mask0 = vector.broadcast %m : i1 to vector<16x14x7x8xi1> %res = vector.transfer_write %vec, %dst[%c0, %c0, %c0, %c0], %mask0 {in_bounds = [true, false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)>} : vector<7x14x8x16xf32>, tensor<?x?x?x?xf32> // CHECK: %[[NEW_VEC0:.*]] = vector.transpose %{{.*}} [3, 1, 0, 2] : vector<7x14x8x16xf32> to vector<16x14x7x8xf32> // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[DST]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, true, false]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32> diff --git a/mlir/test/Dialect/common_folders.mlir b/mlir/test/Dialect/common_folders.mlir new file mode 100644 index 0000000..92598b4 --- /dev/null +++ b/mlir/test/Dialect/common_folders.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-opt %s --test-fold-type-converting-op --split-input-file | FileCheck %s + +// CHECK-LABEL: @test_fold_unary_op_f32_to_si32( +func.func @test_fold_unary_op_f32_to_si32() -> tensor<4x2xsi32> { + // CHECK-NEXT: %[[POSITIVE_ONE:.*]] = arith.constant dense<1> : tensor<4x2xsi32> + // CHECK-NEXT: return %[[POSITIVE_ONE]] : tensor<4x2xsi32> + %operand = arith.constant dense<5.1> : tensor<4x2xf32> + %sign = test.sign %operand : (tensor<4x2xf32>) -> tensor<4x2xsi32> + return %sign : tensor<4x2xsi32> +} + +// ----- + +// CHECK-LABEL: @test_fold_binary_op_f32_to_i1( +func.func 
@test_fold_binary_op_f32_to_i1() -> tensor<8xi1> { + // CHECK-NEXT: %[[FALSE:.*]] = arith.constant dense<false> : tensor<8xi1> + // CHECK-NEXT: return %[[FALSE]] : tensor<8xi1> + %lhs = arith.constant dense<5.1> : tensor<8xf32> + %rhs = arith.constant dense<4.2> : tensor<8xf32> + %less_than = test.less_than %lhs, %rhs : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xi1> + return %less_than : tensor<8xi1> +} diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir index 0ee0166..219367a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir @@ -46,7 +46,7 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() { %c0 = arith.constant 0 : index %f10 = arith.constant 10.0 : f32 - %acc = vector.splat %f10 : vector<[4]x[4]xf32> + %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32> %vector_i32 = llvm.intr.stepvector : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> %tile = vector.outerproduct %vector, %vector, %acc : vector<[4]xf32>, vector<[4]xf32> @@ -103,7 +103,7 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() { %ones = arith.constant dense<1> : vector<[4]xi32> %f10 = arith.constant 10.0 : f32 - %acc = vector.splat %f10 : vector<[4]x[4]xf32> + %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32> %step_vector = llvm.intr.stepvector : vector<[4]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir index 8e81210..059f24a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir @@ -52,7 +52,7 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() { %ones = arith.constant dense<1> : vector<[2]xi32> %f10 = arith.constant 10.0 : f64 - %acc = vector.splat %f10 : vector<[2]x[2]xf64> + %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64> %step_vector = llvm.intr.stepvector : vector<[2]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32> %vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64> @@ -108,7 +108,7 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() { %ones = arith.constant dense<1> : vector<[2]xi32> %f10 = arith.constant 10.0 : f64 - %acc = vector.splat %f10 : vector<[2]x[2]xf64> + %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64> %step_vector = llvm.intr.stepvector : vector<[2]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32> %vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir index c3bf379..bf6900c 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir @@ -10,7 +10,7 @@ // Vector store. 
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) { %c0 = arith.constant 0.0 : f32 - %zero = vector.splat %c0 : vector<[4]x[4]xf32> + %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32> vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32> return @@ -22,7 +22,7 @@ func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: i %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> - %zero = vector.splat %c0 : vector<[4]x[4]xf32> + %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32> vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir index c990432..192f291 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir @@ -106,7 +106,7 @@ func.func @matvec_i32() { // val = (123 * 314) * 4 * vscale // so ... %vscale = vector.vscale - %vscale_v = vector.splat %vscale : vector<3xindex> + %vscale_v = vector.broadcast %vscale : index to vector<3xindex> %vscale_i32 = arith.index_cast %vscale_v : vector<3xindex> to vector<3xi32> %mv1_div = arith.divui %mv1, %vscale_i32 : vector<3xi32> // ... val / vscale = 123 * 314 * 4 = 154488 diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir index d3b1fa4..2d8180a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir @@ -7,8 +7,8 @@ func.func @entry() { %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 - %v1 = vector.splat %f1 : vector<[4]xf32> - %v2 = vector.splat %f2 : vector<[4]xf32> + %v1 = vector.broadcast %f1 : f32 to vector<[4]xf32> + %v2 = vector.broadcast %f2 : f32 to vector<[4]xf32> vector.print %v1 : vector<[4]xf32> vector.print %v2 : vector<[4]xf32> // diff --git a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir index f812c25..740c742 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir @@ -6,8 +6,8 @@ func.func @entry() { %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 - %v1 = vector.splat %f1 : vector<2x4xf32> - %v2 = vector.splat %f2 : vector<2x4xf32> + %v1 = vector.broadcast %f1 : f32 to vector<2x4xf32> + %v2 = vector.broadcast %f2 : f32 to vector<2x4xf32> vector.print %v1 : vector<2x4xf32> vector.print %v2 : vector<2x4xf32> // diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir index f7e2229..e25795a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir @@ -14,9 +14,9 @@ !vector_type_R = vector<7xf32> func.func @vector_outerproduct_splat_8x8(%fa: f32, %fb: f32, %fc: f32) -> !vector_type_C { - %a = vector.splat %fa: !vector_type_A - %b = vector.splat %fb: !vector_type_B - %c = vector.splat %fc: !vector_type_C + %a = vector.broadcast %fa: f32 to !vector_type_A + %b = vector.broadcast %fb: f32 to 
!vector_type_B + %c = vector.broadcast %fc: f32 to !vector_type_C %d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B return %d: !vector_type_C } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir index a19dfa1..0675102 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir @@ -14,9 +14,9 @@ !vector_type_R = vector<7xi64> func.func @vector_outerproduct_splat_8x8(%ia: i64, %ib: i64, %ic: i64) -> !vector_type_C { - %a = vector.splat %ia: !vector_type_A - %b = vector.splat %ib: !vector_type_B - %c = vector.splat %ic: !vector_type_C + %a = vector.broadcast %ia: i64 to !vector_type_A + %b = vector.broadcast %ib: i64 to !vector_type_B + %c = vector.broadcast %ic: i64 to !vector_type_C %d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B return %d: !vector_type_C } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir index 639eed4..895b881 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir @@ -137,7 +137,7 @@ func.func @transfer_read_1d_mask_in_bounds( // Non-contiguous, strided store. func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<7xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32> vector.transfer_write %vf0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d0)>} : vector<7xf32>, memref<?x?xf32> @@ -147,7 +147,7 @@ func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : inde // Non-contiguous, strided store. func.func @transfer_write_1d_mask(%A : memref<?x?xf32>, %base1 : index, %base2 : index) { %fn1 = arith.constant -2.0 : f32 - %vf0 = vector.splat %fn1 : vector<7xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32> %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1> vector.transfer_write %vf0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d0)>} diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir index 009c137..80dff9d 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir @@ -100,7 +100,7 @@ func.func @transfer_read_2d_broadcast( // Vector store. 
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<1x4xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32> vector.transfer_write %vf0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : vector<1x4xf32>, memref<?x?xf32> @@ -111,7 +111,7 @@ func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) { %fn1 = arith.constant -2.0 : f32 %mask = arith.constant dense<[[1, 0, 1, 0]]> : vector<1x4xi1> - %vf0 = vector.splat %fn1 : vector<1x4xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32> vector.transfer_write %vf0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : vector<1x4xf32>, memref<?x?xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir index d41d9c9..93e6a12 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir @@ -62,7 +62,7 @@ func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>, func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>, %o: index, %a: index, %b: index, %c: index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<2x9x3xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<2x9x3xf32> vector.transfer_write %vf0, %A[%o, %a, %b, %c] : vector<2x9x3xf32>, memref<?x?x?x?xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir index d1a2790..18084e3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir @@ -45,7 +45,7 @@ func.func @transfer_read_mask_inbounds_4(%A : memref<?xf32>, %base: index) { func.func @transfer_write_1d(%A : memref<?xf32>, %base: index) { %f0 = arith.constant 0.0 : f32 - %vf0 = vector.splat %f0 : vector<4xf32> + %vf0 = vector.broadcast %f0 : f32 to vector<4xf32> vector.transfer_write %vf0, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<4xf32>, memref<?xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir index def7081..2251738 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir @@ -5,7 +5,7 @@ func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 16.0 : f32 - %v = vector.splat %f : vector<16xf32> + %v = vector.broadcast %f : f32 to vector<16xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<16xf32>, memref<?xf32> @@ -14,7 +14,7 @@ func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) { func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 13.0 : f32 - %v = vector.splat %f : vector<13xf32> + %v = vector.broadcast %f : f32 to vector<13xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<13xf32>, memref<?xf32> @@ -23,7 +23,7 @@ func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) { func.func @transfer_write17_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 17.0 : f32 - %v = vector.splat %f : 
vector<17xf32> + %v = vector.broadcast %f : f32 to vector<17xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<17xf32>, memref<?xf32> @@ -42,7 +42,7 @@ func.func @transfer_read_1d(%A : memref<?xf32>) -> vector<32xf32> { func.func @transfer_write_inbounds_3d(%A : memref<4x4x4xf32>) { %c0 = arith.constant 0: index %f = arith.constant 0.0 : f32 - %v0 = vector.splat %f : vector<2x3x4xf32> + %v0 = vector.broadcast %f : f32 to vector<2x3x4xf32> %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 %f3 = arith.constant 3.0 : f32 diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir new file mode 100644 index 0000000..3553907 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}} + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_fine_grained_memory, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} { + llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "capture"} : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5> + %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr + %9 = llvm.mlir.constant(0 : i32) : i32 + %10 = llvm.mlir.constant(128 : i32) : i32 + %11 = llvm.mlir.constant(1 : i64) : i64 + %12 = llvm.mlir.constant(1 : i64) : i64 + %13 = llvm.mlir.constant(1 : i64) : i64 + 
llvm.store %10, %2 : i32, !llvm.ptr + llvm.store %9, %8 : i32, !llvm.ptr + %14 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"} + %15 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "capture"} + %16 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"} + omp.target map_entries(%14 -> %arg0, %15 -> %arg1, %16 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %17 = llvm.mlir.constant(1 : i32) : i32 + %18 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.parallel num_threads(%18 : i32) { + omp.atomic.capture { + omp.atomic.read %arg1 = %arg2 : !llvm.ptr, !llvm.ptr, i32 + omp.atomic.update %arg2 : !llvm.ptr { + ^bb0(%arg3: i32): + %19 = llvm.add %arg3, %17 : i32 + omp.yield(%19 : i32) + } {atomic_control = #omp.atomic_control<fine_grained_memory = true>} + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir new file mode 100644 index 0000000..3b0005b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir @@ -0,0 +1,36 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}} + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_ignore_denormal_mode, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} { + llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : 
!llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(128 : i32) : i32 + %8 = llvm.mlir.constant(1 : i64) : i64 + %9 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %7, %2 : i32, !llvm.ptr + llvm.store %6, %5 : i32, !llvm.ptr + %10 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"} + %11 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"} + omp.target map_entries(%10 -> %arg0, %11 -> %arg1 : !llvm.ptr, !llvm.ptr) { + %12 = llvm.mlir.constant(1 : i32) : i32 + %13 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.parallel num_threads(%13 : i32) { + omp.atomic.update %arg1 : !llvm.ptr { + ^bb0(%arg2: i32): + %14 = llvm.add %arg2, %12 : i32 + omp.yield(%14 : i32) + } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + omp.terminator + } + omp.terminator + } + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 740990a..ce43941 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -125,6 +125,23 @@ llvm.func @rocdl.ballot64(%pred : i1) -> i64 { llvm.return %0 : i64 } +llvm.func @rocdl.readfirstlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 { + // CHECK-LABEL: rocdl.readfirstlane + // CHECK: call float @llvm.amdgcn.readfirstlane.f32(float %{{.*}}) + %0 = rocdl.readfirstlane %src0 : f32 + + // CHECK: call double @llvm.amdgcn.readfirstlane.f64(double %{{.*}}) + %1 = rocdl.readfirstlane %src1 : f64 + + // CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %{{.*}}) + %2 = rocdl.readfirstlane %src2 : i32 + + // CHECK: call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %{{.*}}) + %3 = rocdl.readfirstlane %src3 : vector<2 x f32> + + llvm.return %0 : f32 +} + llvm.func @rocdl.readlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 { %idx = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 843bd30..3cdd2f2 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -1169,6 +1169,26 @@ def OpP : TEST_Op<"op_p"> { let results = (outs I32); } +// Test constant-folding a pattern that maps `(F32) -> SI32`. +def SignOp : TEST_Op<"sign", [SameOperandsAndResultShape]> { + let arguments = (ins RankedTensorOf<[F32]>:$operand); + let results = (outs RankedTensorOf<[SI32]>:$result); + + let assemblyFormat = [{ + $operand attr-dict `:` functional-type(operands, results) + }]; +} + +// Test constant-folding a pattern that maps `(F32, F32) -> I1`. +def LessThanOp : TEST_Op<"less_than", [SameOperandsAndResultShape]> { + let arguments = (ins RankedTensorOf<[F32]>:$lhs, RankedTensorOf<[F32]>:$rhs); + let results = (outs RankedTensorOf<[I1]>:$result); + + let assemblyFormat = [{ + $lhs `,` $rhs attr-dict `:` functional-type(operands, results) + }]; +} + // Test same operand name enforces equality condition check. 
def TestEqualArgsPattern : Pat<(OpN $a, $a), (OpO $a)>; diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 7150401..f92f098 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -10,6 +10,7 @@ #include "TestOps.h" #include "TestTypes.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/CommonFolders.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -202,6 +203,66 @@ struct HoistEligibleOps : public OpRewritePattern<test::OneRegionOp> { } }; +struct FoldSignOpF32ToSI32 : public OpRewritePattern<test::SignOp> { + using OpRewritePattern<test::SignOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(test::SignOp op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 1 || op->getNumResults() != 1) + return failure(); + + TypedAttr operandAttr; + matchPattern(op->getOperand(0), m_Constant(&operandAttr)); + if (!operandAttr) + return failure(); + + TypedAttr res = cast_or_null<TypedAttr>( + constFoldUnaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>( + operandAttr, op.getType(), [](APFloat operand) -> APSInt { + static const APFloat zero(0.0f); + int operandSign = 0; + if (operand != zero) + operandSign = (operand < zero) ? -1 : +1; + return APSInt(APInt(32, operandSign), false); + })); + if (!res) + return failure(); + + rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res); + return success(); + } +}; + +struct FoldLessThanOpF32ToI1 : public OpRewritePattern<test::LessThanOp> { + using OpRewritePattern<test::LessThanOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(test::LessThanOp op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 2 || op->getNumResults() != 1) + return failure(); + + TypedAttr lhsAttr; + TypedAttr rhsAttr; + matchPattern(op->getOperand(0), m_Constant(&lhsAttr)); + matchPattern(op->getOperand(1), m_Constant(&rhsAttr)); + + if (!lhsAttr || !rhsAttr) + return failure(); + + Attribute operandAttrs[2] = {lhsAttr, rhsAttr}; + TypedAttr res = cast_or_null<TypedAttr>( + constFoldBinaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>( + operandAttrs, op.getType(), [](APFloat lhs, APFloat rhs) -> APInt { + return APInt(1, lhs < rhs); + })); + if (!res) + return failure(); + + rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res); + return success(); + } +}; + /// This pattern moves "test.move_before_parent_op" before the parent op. struct MoveBeforeParentOp : public RewritePattern { MoveBeforeParentOp(MLIRContext *context) @@ -2226,6 +2287,24 @@ struct TestSelectiveReplacementPatternDriver (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; + +struct TestFoldTypeConvertingOp + : public PassWrapper<TestFoldTypeConvertingOp, OperationPass<>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestFoldTypeConvertingOp) + + StringRef getArgument() const final { return "test-fold-type-converting-op"; } + StringRef getDescription() const final { + return "Test helper functions for folding ops whose input and output types " + "differ, e.g. 
float comparisons of the form `(f32, f32) -> i1`."; + } + void runOnOperation() override { + MLIRContext *context = &getContext(); + mlir::RewritePatternSet patterns(context); + patterns.add<FoldSignOpF32ToSI32, FoldLessThanOpF32ToI1>(context); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) + signalPassFailure(); + } +}; } // namespace //===----------------------------------------------------------------------===// @@ -2256,6 +2335,8 @@ void registerPatternsTestPass() { PassRegistration<TestMergeBlocksPatternDriver>(); PassRegistration<TestSelectiveReplacementPatternDriver>(); + + PassRegistration<TestFoldTypeConvertingOp>(); } } // namespace test } // namespace mlir diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6486b2b..272a12a 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -72,6 +72,8 @@ struct ol_queue_impl_t { struct ol_event_impl_t { ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue) : EventInfo(EventInfo), Queue(Queue) {} + // EventInfo may be null, in which case the event should be considered always + // complete void *EventInfo; ol_queue_handle_t Queue; }; @@ -509,8 +511,8 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events, return Plugin::error(ErrorCode::INVALID_NULL_HANDLE, "olWaitEvents asked to wait on a NULL event"); - // Do nothing if the event is for this queue - if (Event->Queue == Queue) + // Do nothing if the event is for this queue or the event is always complete + if (Event->Queue == Queue || !Event->EventInfo) continue; if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo)) @@ -548,6 +550,10 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName, } Error olSyncEvent_impl(ol_event_handle_t Event) { + if (!Event->EventInfo) + // Event always complete + return Plugin::success(); + if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo)) return Res; @@ -555,8 +561,9 @@ Error olSyncEvent_impl(ol_event_handle_t Event) { } Error olDestroyEvent_impl(ol_event_handle_t Event) { - if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo)) - return Res; + if (Event->EventInfo) + if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo)) + return Res; return olDestroy(Event); } @@ -590,7 +597,16 @@ Error olGetEventInfoSize_impl(ol_event_handle_t Event, ol_event_info_t PropName, } Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) { + auto Pending = Queue->Device->Device->hasPendingWork(Queue->AsyncInfo); + if (auto Err = Pending.takeError()) + return Err; + *EventOut = new ol_event_impl_t(nullptr, Queue); + if (!*Pending) + // Queue is empty, don't record an event and consider the event always + // complete + return Plugin::success(); + if (auto Res = Queue->Device->Device->createEvent(&(*EventOut)->EventInfo)) return Res; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 82deeb4..852c0e9 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2591,6 +2591,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Event->wait(*Stream); } + Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { + auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>(); + if (!Stream) + return false; + + auto Query = Stream->query(); + if (Query) + return !*Query; 
+    return Query.takeError();
+  }
+
   /// Synchronize the current thread with the event.
   Error syncEventImpl(void *EventPtr) override {
     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 4b7d410..1d64193 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -973,6 +973,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error printInfo();
   virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
 
+  /// Return true if the device has work that is either queued or currently
+  /// running.
+  ///
+  /// Devices that cannot report this information should always return true.
+  Expected<bool> hasPendingWork(__tgt_async_info *AsyncInfo);
+  virtual Expected<bool>
+  hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Getters of the grid values.
   uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
   uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 0a57046..bcc9179 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1626,6 +1626,21 @@ Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
   return Err;
 }
 
+Expected<bool> GenericDeviceTy::hasPendingWork(__tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Res = hasPendingWorkImpl(AsyncInfoWrapper);
+  if (auto Err = Res.takeError()) {
+    AsyncInfoWrapper.finalize(Err);
+    return Err;
+  }
+
+  auto Err = Plugin::success();
+  AsyncInfoWrapper.finalize(Err);
+  if (Err)
+    return Err;
+  return Res;
+}
+
 Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index c5f3167..7649fd9 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -916,6 +916,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
   }
 
+  // TODO: This should be implementable on CUDA.
+  Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
+    return true;
+  }
+
   /// Synchronize the current thread with the event.
   Error syncEventImpl(void *EventPtr) override {
     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572..9abc350 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -333,6 +333,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                  AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     return Plugin::success();
   }
+  Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
+    return true;
+  }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
 
   /// Print information about the device.
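Taken together, the offload hunks above implement an "always complete" event: olCreateEvent_impl first asks the device whether the queue has any pending work and, if it does not, returns an event whose EventInfo stays null; olWaitEvents_impl, olSyncEvent_impl, and olDestroyEvent_impl then short-circuit on that null instead of calling into the plugin. The standalone sketch below restates that control flow under simplified assumptions; the Queue and Event types and the createEvent/syncEvent helpers are hypothetical stand-ins invented for illustration, not the real liboffload API.

#include <cassert>
#include <cstdio>

// Hypothetical stand-ins for ol_queue_impl_t / ol_event_impl_t (illustration
// only; the real types live in offload/liboffload/src/OffloadImpl.cpp).
struct Queue {
  bool HasPendingWork = false; // what hasPendingWork() would report
};

struct Event {
  void *EventInfo = nullptr; // null => the event is "always complete"
  Queue *Q = nullptr;
};

// Mirrors olCreateEvent_impl: only record a native event when the queue
// actually has outstanding work.
Event createEvent(Queue &Q) {
  Event Ev{nullptr, &Q};
  if (!Q.HasPendingWork)
    return Ev; // empty queue: leave EventInfo null, record nothing
  static int NativeEvent; // placeholder for a plugin-created event handle
  Ev.EventInfo = &NativeEvent;
  return Ev;
}

// Mirrors olSyncEvent_impl: a null EventInfo completes without any device
// round-trip.
bool syncEvent(const Event &Ev) {
  if (!Ev.EventInfo)
    return true; // always complete
  // (the real code would call the plugin's syncEvent(Ev.EventInfo) here)
  return true;
}

int main() {
  Queue Q; // nothing has been submitted to this queue
  Event Ev = createEvent(Q);
  assert(Ev.EventInfo == nullptr);
  std::printf("event complete: %s\n", syncEvent(Ev) ? "yes" : "no");
  return 0;
}

The plugin side only has to answer one new question, hasPendingWorkImpl; backends that cannot query their stream (the CUDA and host plugins above) conservatively return true, which merely disables the optimization without changing the observable semantics.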
diff --git a/offload/unittests/CMakeLists.txt b/offload/unittests/CMakeLists.txt
index 726d3ee..a0d5c01 100644
--- a/offload/unittests/CMakeLists.txt
+++ b/offload/unittests/CMakeLists.txt
@@ -104,11 +104,6 @@ function(add_conformance_test test_name)
   list(TRANSFORM ARGN PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/"
        OUTPUT_VARIABLE files)
 
-  if(NOT TARGET libc)
-    message(WARNING "Cannot run conformance tests without the LLVM C library")
-    return()
-  endif()
-
   add_executable(${target_name} ${files})
   add_dependencies(${target_name} conformance_device_binaries)
   target_compile_definitions(${target_name}
diff --git a/offload/unittests/Conformance/tests/CMakeLists.txt b/offload/unittests/Conformance/tests/CMakeLists.txt
index 5c0c3ba..8c0109ba 100644
--- a/offload/unittests/Conformance/tests/CMakeLists.txt
+++ b/offload/unittests/Conformance/tests/CMakeLists.txt
@@ -1,3 +1,8 @@
+if(NOT TARGET libc)
+  message(WARNING "Cannot run conformance tests without the LLVM C library")
+  return()
+endif()
+
 add_conformance_test(acosf AcosfTest.cpp)
 add_conformance_test(acoshf AcoshfTest.cpp)
 add_conformance_test(asinf AsinfTest.cpp)
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 9fe42c6d..d95a37f 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -382,6 +382,7 @@ cc_library(
         "//mlir:BufferizationInterfaces",
         "//mlir:BytecodeOpInterface",
         "//mlir:CallOpInterfaces",
+        "//mlir:CommonFolders",
         "//mlir:ControlFlowInterfaces",
         "//mlir:CopyOpInterface",
         "//mlir:DLTIDialect",
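Closing note on the new plugin hook: the AMDGPU implementation earlier in this change derives "pending" by inverting the stream's completion query and forwards failures through llvm::Expected. Below is a minimal sketch of that shape, assuming a hypothetical Stream type whose query() reports whether all submitted work has finished (a stand-in for AMDGPUStreamTy::query(), not the actual class).

#include "llvm/Support/Error.h"

// Hypothetical stand-in for AMDGPUStreamTy; query() is assumed to answer
// "has every operation submitted to this stream completed?".
struct Stream {
  llvm::Expected<bool> query() { return true; }
};

// Shape of hasPendingWorkImpl: no stream means nothing was ever enqueued;
// otherwise pending work is the negation of "done".
llvm::Expected<bool> hasPendingWork(Stream *S) {
  if (!S)
    return false;
  auto Done = S->query();
  if (!Done)
    return Done.takeError(); // propagate the plugin error unchanged
  return !*Done;
}

Callers must consume the Expected on both paths, which is why the generic GenericDeviceTy::hasPendingWork wrapper above finalizes its AsyncInfoWrapperTy before returning the result.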