author    Vitaly Buka <vitalybuka@google.com>  2024-07-26 10:20:26 -0700
committer Vitaly Buka <vitalybuka@google.com>  2024-07-26 10:20:26 -0700
commit    f75b8a41bbf63c4424af9cc289ce958ba3364342 (patch)
tree      a7d5a2029b12fd7e0ea1ec129204da5498e9cd8e
parent    7c586d05f48f39515fd981a44d75efb5356975c2 (diff)
parent    d683d378998c85c12d7f0549944f807bb44c7b76 (diff)
[spr] changes introduced through rebase (users/vitalybuka/spr/main.ubsanhwasan-let-mixing-filters)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- .github/workflows/release-asset-audit.py | 51
-rw-r--r-- .github/workflows/release-asset-audit.yml | 54
-rw-r--r-- .github/workflows/release-sources.yml | 2
-rwxr-xr-x clang-tools-extra/clang-tidy/add_new_check.py | 41
-rw-r--r-- clang-tools-extra/test/clang-doc/basic-project.test | 3
-rw-r--r-- clang/docs/LanguageExtensions.rst | 1
-rw-r--r-- clang/docs/ReleaseNotes.rst | 6
-rw-r--r-- clang/include/clang/AST/Type.h | 11
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2
-rw-r--r-- clang/include/clang/Basic/LangOptions.def | 2
-rw-r--r-- clang/include/clang/Basic/TokenKinds.def | 1
-rw-r--r-- clang/include/clang/Driver/Options.td | 14
-rw-r--r-- clang/include/clang/StaticAnalyzer/Checkers/Checkers.td | 12
-rw-r--r-- clang/lib/AST/DeclBase.cpp | 3
-rw-r--r-- clang/lib/AST/Interp/Disasm.cpp | 11
-rw-r--r-- clang/lib/AST/Interp/InterpBuiltin.cpp | 37
-rw-r--r-- clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp | 2
-rw-r--r-- clang/lib/Basic/LangOptions.cpp | 4
-rw-r--r-- clang/lib/Basic/Targets/OSTargets.cpp | 8
-rw-r--r-- clang/lib/CodeGen/CGAtomic.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGExprScalar.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGStmtOpenMP.cpp | 4
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 7
-rw-r--r-- clang/lib/CodeGen/TargetInfo.h | 4
-rw-r--r-- clang/lib/CodeGen/Targets/AMDGPU.cpp | 19
-rw-r--r-- clang/lib/Driver/ToolChains/AMDGPU.cpp | 6
-rw-r--r-- clang/lib/Driver/ToolChains/Clang.cpp | 28
-rw-r--r-- clang/lib/Driver/ToolChains/PS4CPU.cpp | 4
-rw-r--r-- clang/lib/Frontend/InitPreprocessor.cpp | 2
-rw-r--r-- clang/lib/Frontend/PrintPreprocessedOutput.cpp | 1
-rw-r--r-- clang/lib/Lex/PPMacroExpansion.cpp | 10
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp | 15
-rw-r--r-- clang/lib/Sema/SemaDecl.cpp | 24
-rw-r--r-- clang/lib/Sema/SemaDeclAttr.cpp | 7
-rw-r--r-- clang/lib/Sema/SemaDeclCXX.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaExprCXX.cpp | 26
-rw-r--r-- clang/lib/Sema/SemaPPC.cpp | 3
-rw-r--r-- clang/lib/Sema/SemaTemplateDeduction.cpp | 4
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp | 58
-rw-r--r-- clang/test/AST/Interp/builtins.cpp | 5
-rw-r--r-- clang/test/Analysis/analyzer-config.c | 2
-rw-r--r-- clang/test/Analysis/mmap-writeexec.c | 11
-rw-r--r-- clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c | 316
-rw-r--r-- clang/test/CodeGen/aarch64-fmv-streaming.c | 107
-rw-r--r-- clang/test/CodeGen/finite-math.c | 2
-rw-r--r-- clang/test/CodeGen/fp-floatcontrol-stack.cpp | 2
-rw-r--r-- clang/test/CodeGen/fp-options-to-fast-math-flags.c | 2
-rw-r--r-- clang/test/CodeGen/nofpclass.c | 4
-rw-r--r-- clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 102
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-alloca.cl | 141
-rw-r--r-- clang/test/CodeGenOpenCL/relaxed-fpmath.cl | 4
-rw-r--r-- clang/test/Driver/amdgpu-toolchain.c | 4
-rw-r--r-- clang/test/Driver/opencl.cl | 2
-rw-r--r-- clang/test/Driver/stack-size-section.c | 1
-rw-r--r-- clang/test/Headers/__clang_hip_cmath.hip | 3
-rw-r--r-- clang/test/Headers/__clang_hip_math.hip | 3
-rw-r--r-- clang/test/Headers/float.c | 2
-rw-r--r-- clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp | 59
-rw-r--r-- clang/test/Parser/namelookup-anonymous-struct.c | 6
-rw-r--r-- clang/test/Preprocessor/predefined-macros.c | 4
-rw-r--r-- clang/test/Sema/aarch64-fmv-streaming.c | 46
-rw-r--r-- clang/test/Sema/aarch64-sme-func-attrs.c | 42
-rw-r--r-- clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp | 9
-rw-r--r-- clang/test/Sema/warn-infinity-nan-disabled-win.cpp | 5
-rw-r--r-- clang/test/SemaCXX/pr100095.cpp | 17
-rw-r--r-- clang/test/SemaCXX/type-traits.cpp | 74
-rw-r--r-- clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 7
-rw-r--r-- clang/tools/clang-nvlink-wrapper/NVLinkOpts.td | 7
-rw-r--r-- clang/unittests/AST/ASTImporterTest.cpp | 37
-rw-r--r-- clang/unittests/AST/StructuralEquivalenceTest.cpp | 14
-rw-r--r-- clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp | 49
-rw-r--r-- flang/lib/Frontend/CompilerInvocation.cpp | 2
-rw-r--r-- flang/lib/Semantics/check-cuda.cpp | 32
-rw-r--r-- flang/runtime/transformational.cpp | 5
-rw-r--r-- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 2
-rw-r--r-- flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf | 4
-rw-r--r-- flang/test/Parser/cuf-sanity-common | 3
-rw-r--r-- flang/test/Semantics/cuf09.cuf | 10
-rw-r--r-- flang/test/Semantics/reduce.cuf | 6
-rw-r--r-- libc/config/darwin/arm/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/aarch64/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/arm/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/riscv/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/x86_64/entrypoints.txt | 1
-rw-r--r-- libc/config/windows/entrypoints.txt | 1
-rw-r--r-- libc/docs/math/index.rst | 2
-rw-r--r-- libc/spec/stdc.td | 1
-rw-r--r-- libc/src/math/generic/CMakeLists.txt | 20
-rw-r--r-- libc/src/math/generic/atan2.cpp | 313
-rw-r--r-- libc/startup/gpu/CMakeLists.txt | 10
-rw-r--r-- libc/test/src/math/CMakeLists.txt | 12
-rw-r--r-- libc/test/src/math/atan2_test.cpp | 125
-rw-r--r-- libc/test/src/math/smoke/CMakeLists.txt | 10
-rw-r--r-- libc/test/src/math/smoke/atan2_test.cpp | 22
-rw-r--r-- libcxx/CMakeLists.txt | 16
-rw-r--r-- libcxx/cmake/Modules/CodeCoverage.cmake | 50
-rw-r--r-- libcxx/cmake/caches/Generic-no-exceptions.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-experimental.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-filesystem.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-localization.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-random_device.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-rtti.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-threads.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-tzdb.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-unicode.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-wide-characters.cmake | 4
-rw-r--r-- libcxx/docs/Status/Cxx20Issues.csv | 10
-rw-r--r-- libcxx/docs/Status/Cxx23Issues.csv | 12
-rw-r--r-- libcxx/docs/Status/Cxx2cIssues.csv | 2
-rw-r--r-- libcxx/include/__thread/thread.h | 1
-rw-r--r-- libcxx/src/CMakeLists.txt | 5
-rw-r--r-- libcxx/test/CMakeLists.txt | 18
-rw-r--r-- libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp | 1
-rw-r--r-- lld/ELF/Arch/LoongArch.cpp | 10
-rw-r--r-- lld/ELF/Relocations.cpp | 3
-rw-r--r-- lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s | 129
-rw-r--r-- lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s | 82
-rw-r--r-- lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s | 142
-rw-r--r-- lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp | 4
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 113
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h | 3
-rw-r--r-- lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py | 3
-rw-r--r-- lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py | 3
-rw-r--r-- lldb/tools/lldb-dap/README.md | 31
-rw-r--r-- lldb/tools/lldb-dap/package.json | 4
-rw-r--r-- llvm/bindings/ocaml/llvm/llvm.ml | 1
-rw-r--r-- llvm/docs/ProgrammersManual.rst | 2
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 3
-rw-r--r-- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 22
-rw-r--r-- llvm/include/llvm/CodeGen/AsmPrinter.h | 6
-rw-r--r-- llvm/include/llvm/CodeGen/FunctionLoweringInfo.h | 9
-rw-r--r-- llvm/include/llvm/CodeGen/MIRPrinter.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/MachineFunction.h | 19
-rw-r--r-- llvm/include/llvm/CodeGen/MachineInstr.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/MachineModuleInfo.h | 38
-rw-r--r-- llvm/include/llvm/CodeGen/SDPatternMatch.h | 6
-rw-r--r-- llvm/include/llvm/CodeGen/SelectionDAG.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 1
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 119
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 18
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/BranchFolding.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/DeadMachineInstructionElim.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/EarlyIfConversion.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/IfConversion.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/LiveRangeEdit.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/LiveRangeShrink.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineFunction.cpp | 17
-rw-r--r-- llvm/lib/CodeGen/MachineFunctionAnalysis.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineInstr.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineLICM.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/MachineModuleInfo.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/MachineSink.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/ModuloSchedule.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/RegisterCoalescer.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 27
-rw-r--r-- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 22
-rw-r--r-- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 2
-rw-r--r-- llvm/lib/Target/Lanai/LanaiInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp | 24
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 6
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h | 8
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp | 12
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 15
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h | 4
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 8
-rw-r--r-- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 12
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFeatures.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 122
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVProcessors.td | 20
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 4
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 3
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/X86/X86AsmPrinter.cpp | 30
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedBroadwell.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedHaswell.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedSandyBridge.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedSkylakeClient.td | 2
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 58
-rw-r--r-- llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/LICM.cpp | 70
-rw-r--r-- llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp | 515
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 27
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 44
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 27
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 6
-rw-r--r-- llvm/test/Analysis/CostModel/X86/arith-overflow.ll | 462
-rw-r--r-- llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll | 8
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll | 26
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll | 4
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/print-order.ll | 6
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll | 4
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 103
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-elf.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 24
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/basic-branch.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/early-term.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 787
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sgpr-spill.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-vopd.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.ll | 2
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/common-chain.ll | 315
-rw-r--r-- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 2
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll | 109
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll | 20
-rw-r--r-- llvm/test/CodeGen/SystemZ/vec-combine-01.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/known-bits.ll | 2
-rw-r--r-- llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll | 23
-rw-r--r-- llvm/test/MC/LoongArch/Basic/Integer/invalid.s | 6
-rw-r--r-- llvm/test/MC/LoongArch/Relocations/relocations.s | 20
-rw-r--r-- llvm/test/Transforms/LICM/hoist-binop.ll | 226
-rw-r--r-- llvm/test/Transforms/LICM/sink-foldable.ll | 5
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll | 22
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll | 32
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll | 44
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/convergent.ll | 39
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s | 132
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s | 220
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s | 10
-rw-r--r-- llvm/tools/llvm-driver/llvm-driver.cpp | 2
-rw-r--r-- llvm/tools/llvm-reduce/ReducerWorkItem.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp | 4
-rw-r--r-- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/MFCommon.inc | 3
-rw-r--r-- llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 15
-rw-r--r-- llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp | 3
-rw-r--r-- llvm/unittests/Target/AMDGPU/PALMetadata.cpp | 2
-rw-r--r-- llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp | 2
-rw-r--r-- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 4
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn | 1
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn | 9
-rw-r--r-- llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1
-rw-r--r-- llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn | 11
-rw-r--r-- mlir/docs/DefiningDialects/Operations.md | 53
-rw-r--r-- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 15
-rw-r--r-- mlir/include/mlir/IR/ODSSupport.h | 48
-rw-r--r-- mlir/include/mlir/IR/Properties.td | 565
-rw-r--r-- mlir/include/mlir/TableGen/Operator.h | 2
-rw-r--r-- mlir/include/mlir/TableGen/Property.h | 53
-rw-r--r-- mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h | 1
-rw-r--r-- mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp | 91
-rw-r--r-- mlir/lib/IR/ODSSupport.cpp | 71
-rw-r--r-- mlir/lib/TableGen/Property.cpp | 59
-rw-r--r-- mlir/test/Dialect/ArmSME/vector-legalization.mlir | 26
-rw-r--r-- mlir/test/IR/properties.mlir | 52
-rw-r--r-- mlir/test/IR/traits.mlir | 19
-rw-r--r-- mlir/test/Transforms/test-legalizer.mlir | 4
-rw-r--r-- mlir/test/lib/Dialect/Test/TestFormatUtils.cpp | 16
-rw-r--r-- mlir/test/lib/Dialect/Test/TestFormatUtils.h | 3
-rw-r--r-- mlir/test/lib/Dialect/Test/TestOps.td | 83
-rw-r--r-- mlir/test/lib/Dialect/Test/TestOpsSyntax.td | 22
-rw-r--r-- mlir/test/mlir-tblgen/op-format.mlir | 10
-rw-r--r-- mlir/test/mlir-tblgen/op-format.td | 4
-rw-r--r-- mlir/test/mlir-tblgen/op-properties.td | 120
-rw-r--r-- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 200
-rw-r--r-- mlir/tools/mlir-tblgen/OpFormatGen.cpp | 290
-rw-r--r-- utils/bazel/llvm-project-overlay/bolt/BUILD.bazel | 11
-rw-r--r-- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 5
-rw-r--r-- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3
342 files changed, 7159 insertions, 2266 deletions
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
new file mode 100644
index 0000000..355e7fe
--- /dev/null
+++ b/.github/workflows/release-asset-audit.py
@@ -0,0 +1,51 @@
+import github
+import sys
+
+def main():
+    token = sys.argv[1]
+
+    gh = github.Github(login_or_token=token)
+    repo = gh.get_repo("llvm/llvm-project")
+
+    uploaders = set(
+        [
+            "DimitryAndric",
+            "stefanp-ibm",
+            "lei137",
+            "omjavaid",
+            "nicolerabjohn",
+            "amy-kwan",
+            "mandlebug",
+            "zmodem",
+            "androm3da",
+            "tru",
+            "rovka",
+            "rorth",
+            "quinnlp",
+            "kamaub",
+            "abrisco",
+            "jakeegan",
+            "maryammo",
+            "tstellar",
+            "github-actions[bot]",
+        ]
+    )
+
+    for release in repo.get_releases():
+        print("Release:", release.title)
+        for asset in release.get_assets():
+            created_at = asset.created_at
+            updated_at = (
+                "" if asset.created_at == asset.updated_at else asset.updated_at
+            )
+            print(
+                f"{asset.name} : {asset.uploader.login} [{created_at} {updated_at}] ( {asset.download_count} )"
+            )
+            if asset.uploader.login not in uploaders:
+                with open('comment', 'w') as file:
+                    file.write(f'@{asset.uploader.login} is not a valid uploader.')
+                sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
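For context: the workflow added below invokes this script as "python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN". When an asset's uploader is not in the allowlist above, the script writes the offending login to a file named "comment" and exits non-zero, which the workflow's "File Issue" step then turns into a tracking issue.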
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
new file mode 100644
index 0000000..fd42bc6
--- /dev/null
+++ b/.github/workflows/release-asset-audit.yml
@@ -0,0 +1,54 @@
+name: Release Asset Audit
+
+on:
+  workflow_dispatch:
+  release:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    # Run once an hour
+    - cron: '5 * * * *'
+
+  pull_request:
+    paths:
+      - ".github/workflows/release-asset-audit.py"
+      - ".github/workflows/release-asset-audit.yml"
+
+permissions:
+  contents: read # Default everything to read-only
+
+jobs:
+  audit:
+    name: "Release Asset Audit"
+    runs-on: ubuntu-22.04
+    if: github.repository == 'llvm/llvm-project'
+    steps:
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 #v4.1.6
+      - name: "Run Audit Script"
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
+          python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN
+      - name: "File Issue"
+        if: >-
+          github.event_name != 'pull_request' &&
+          failure()
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+        with:
+          github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
+          script: |
+            var fs = require('fs');
+            var body = ''
+            if (fs.existsSync('./comment')) {
+              body = JSON.parse(fs.readFileSync('./comment')) + "\n\n";
+            }
+            body = body + `\n\nhttps://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
+
+            const issue = await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: "Release Asset Audit Failed",
+              labels: ['infrastructure'],
+              body: body
+            });
+            console.log(issue);
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 9c5b1a9..b0c0b65 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -47,7 +47,7 @@ jobs:
steps:
- id: inputs
run: |
- ref=${{ inputs.release-version || github.sha }}
+ ref=${{ (inputs.release-version && format('llvmorg-{0}', inputs.release-version)) || github.sha }}
if [ -n "${{ inputs.release-version }}" ]; then
export_args="-release ${{ inputs.release-version }} -final"
else
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 3a62df1..1ce2019 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -16,6 +16,7 @@ import io
 import os
 import re
 import sys
+import textwrap
 
 # Adapts the module's CMakelist file. Returns 'True' if it could add a new
 # entry and 'False' if the entry already existed.
@@ -53,7 +54,14 @@ def adapt_cmake(module_path, check_name_camel):
 
 # Adds a header for the new check.
-def write_header(module_path, module, namespace, check_name, check_name_camel):
+def write_header(
+    module_path, module, namespace, check_name, check_name_camel, description
+):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="/// ", subsequent_indent="/// "
+        )
+    )
     filename = os.path.join(module_path, check_name_camel) + ".h"
     print("Creating %s..." % filename)
     with io.open(filename, "w", encoding="utf8", newline="\n") as f:
@@ -85,7 +93,7 @@ def write_header(module_path, module, namespace, check_name, check_name_camel):
 
 namespace clang::tidy::%(namespace)s {
 
-/// FIXME: Write a short description.
+%(description)s
 ///
 /// For the user-facing documentation see:
 /// http://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html
@@ -107,6 +115,7 @@ public:
             "check_name": check_name,
             "module": module,
             "namespace": namespace,
+            "description": wrapped_desc,
         }
     )
@@ -235,7 +244,12 @@ def adapt_module(module_path, module, check_name, check_name_camel):
 
 # Adds a release notes entry.
-def add_release_notes(module_path, module, check_name):
+def add_release_notes(module_path, module, check_name, description):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="  ", subsequent_indent="  "
+        )
+    )
     check_name_dashes = module + "-" + check_name
     filename = os.path.normpath(
         os.path.join(module_path, "../../docs/ReleaseNotes.rst")
@@ -281,10 +295,10 @@ def add_release_notes(module_path, module, check_name):
                 """- New :doc:`%s
   <clang-tidy/checks/%s/%s>` check.
 
-  FIXME: add release notes.
+%s
 """
-                % (check_name_dashes, module, check_name)
+                % (check_name_dashes, module, check_name, wrapped_desc)
             )
             note_added = True
@@ -613,6 +627,13 @@ def main():
         metavar="LANG",
     )
     parser.add_argument(
+        "--description",
+        "-d",
+        help="short description of what the check does",
+        default="FIXME: Write a short description",
+        type=str,
+    )
+    parser.add_argument(
         "module",
         nargs="?",
         help="module directory under which to place the new tidy check (e.g., misc)",
@@ -652,10 +673,16 @@ def main():
     else:
         namespace = module
 
-    write_header(module_path, module, namespace, check_name, check_name_camel)
+    description = args.description
+    if not description.endswith("."):
+        description += "."
+
+    write_header(
+        module_path, module, namespace, check_name, check_name_camel, description
+    )
     write_implementation(module_path, module, namespace, check_name_camel)
     adapt_module(module_path, module, check_name, check_name_camel)
-    add_release_notes(module_path, module, check_name)
+    add_release_notes(module_path, module, check_name, description)
     test_extension = language_to_extension.get(args.language)
    write_test(module_path, module, check_name, test_extension)
    write_docs(module_path, module, check_name)
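For context, a hypothetical invocation of the extended script (the module, check name, and text here are illustrative only):

    ./add_new_check.py --description "Detects suspicious calls to foo()" misc suspicious-foo-call

The description is wrapped to 80 columns, gets a trailing period appended if missing, and replaces the former FIXME placeholders in both the generated header comment and the ReleaseNotes.rst entry.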
diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 51d3ac6..38569d8 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,3 +1,6 @@
+// See https://github.com/llvm/llvm-project/issues/97507.
+// UNSUPPORTED: target={{.*}}
+
// RUN: rm -rf %t && mkdir -p %t/docs %t/build
// RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 81784c7..a747464 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1546,6 +1546,7 @@ The following type trait primitives are supported by Clang. Those traits marked
* ``__array_extent(type, dim)`` (Embarcadero):
The ``dim``'th array bound in the type ``type``, or ``0`` if
``dim >= __array_rank(type)``.
+* ``__builtin_is_virtual_base_of`` (C++, GNU, Microsoft)
* ``__can_pass_in_regs`` (C++)
Returns whether a class can be passed in registers under the current
ABI. This type can only be applied to unqualified class types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5dddd8f..286f319 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -81,6 +81,9 @@ C++23 Feature Support
C++2c Feature Support
^^^^^^^^^^^^^^^^^^^^^
+- Add ``__builtin_is_virtual_base_of`` intrinsic, which supports
+ `P2985R0 A type trait for detecting virtual base classes <https://wg21.link/p2985r0>`_
+
Resolutions to C++ Defect Reports
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -163,6 +166,9 @@ Miscellaneous Bug Fixes
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Fixed a crash in C caused by incorrect lookup, where members of nested anonymous
+  structs/unions could be found as ordinary identifiers during struct/union definition. (#GH31295)
+
OpenACC Specific Changes
------------------------
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 72723c7..89a74ff 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4698,26 +4698,25 @@ public:
};
private:
- LLVM_PREFERRED_TYPE(Kind)
- unsigned FKind : 3;
+ Kind FKind;
// Expansion: for hypothetical TCB+types, there could be one Kind for TCB,
// then ~16(?) bits "SubKind" to map to a specific named TCB. SubKind would
// be considered for uniqueness.
public:
- FunctionEffect() : FKind(unsigned(Kind::None)) {}
+ FunctionEffect() : FKind(Kind::None) {}
- explicit FunctionEffect(Kind K) : FKind(unsigned(K)) {}
+ explicit FunctionEffect(Kind K) : FKind(K) {}
/// The kind of the effect.
- Kind kind() const { return Kind(FKind); }
+ Kind kind() const { return FKind; }
/// Return the opposite kind, for effects which have opposites.
Kind oppositeKind() const;
/// For serialization.
- uint32_t toOpaqueInt32() const { return FKind; }
+ uint32_t toOpaqueInt32() const { return uint32_t(FKind); }
static FunctionEffect fromOpaqueInt32(uint32_t Value) {
return FunctionEffect(Kind(Value));
}
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 810abe4..beee243 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3812,8 +3812,6 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning<
InGroup<AArch64SMEAttributes>, DefaultIgnore;
def err_conflicting_attributes_arm_state : Error<
"conflicting attributes for state '%0'">;
-def err_sme_streaming_cannot_be_multiversioned : Error<
- "streaming function cannot be multi-versioned">;
def err_unknown_arm_state : Error<
"unknown state '%0'">;
def err_missing_arm_state : Error<
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 834a6f6..0035092c 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -224,7 +224,6 @@ COMPATIBLE_LANGOPT(GNUInline , 1, 0, "GNU inline semantics")
COMPATIBLE_LANGOPT(NoInlineDefine , 1, 0, "__NO_INLINE__ predefined macro")
COMPATIBLE_LANGOPT(Deprecated , 1, 0, "__DEPRECATED predefined macro")
COMPATIBLE_LANGOPT(FastMath , 1, 0, "fast FP math optimizations, and __FAST_MATH__ predefined macro")
-COMPATIBLE_LANGOPT(FiniteMathOnly , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
COMPATIBLE_LANGOPT(UnsafeFPMath , 1, 0, "Unsafe Floating Point Math")
COMPATIBLE_LANGOPT(ProtectParens , 1, 0, "optimizer honors parentheses "
"when floating-point expressions are evaluated")
@@ -340,7 +339,6 @@ LANGOPT(SinglePrecisionConstants , 1, 0, "treating double-precision floating poi
LANGOPT(FastRelaxedMath , 1, 0, "OpenCL fast relaxed math")
BENIGN_LANGOPT(CLNoSignedZero , 1, 0, "Permit Floating Point optimization without regard to signed zeros")
COMPATIBLE_LANGOPT(CLUnsafeMath , 1, 0, "Unsafe Floating Point Math")
-COMPATIBLE_LANGOPT(CLFiniteMathOnly , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
/// FP_CONTRACT mode (on/off/fast).
BENIGN_ENUM_LANGOPT(DefaultFPContractMode, FPModeKind, 2, FPM_Off, "FP contraction type")
COMPATIBLE_LANGOPT(ExpStrictFP, 1, false, "Enable experimental strict floating point")
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 8c54661..7e638dc 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -501,6 +501,7 @@ TYPE_TRAIT_1(__has_trivial_move_assign, HasTrivialMoveAssign, KEYCXX)
TYPE_TRAIT_1(__has_trivial_move_constructor, HasTrivialMoveConstructor, KEYCXX)
// GNU and MS Type Traits
+TYPE_TRAIT_2(__builtin_is_virtual_base_of, IsVirtualBaseOf, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_copy, HasNothrowCopy, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_constructor, HasNothrowConstructor, KEYCXX)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 26811bf..ccccc95 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1117,8 +1117,7 @@ def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">,
MarshallingInfoFlag<LangOpts<"SinglePrecisionConstants">>;
def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>,
Visibility<[ClangOption, CC1Option]>,
- HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">,
- MarshallingInfoFlag<LangOpts<"CLFiniteMathOnly">>;
+ HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>,
Visibility<[ClangOption, CC1Option]>,
HelpText<"OpenCL only. Generate kernel argument metadata.">,
@@ -2609,13 +2608,12 @@ defm approx_func : BoolFOption<"approx-func", LangOpts<"ApproxFunc">, DefaultFal
"with an approximately equivalent calculation",
[funsafe_math_optimizations.KeyPath]>,
NegFlag<SetFalse, [], [ClangOption, CC1Option, FC1Option, FlangOption]>>;
-defm finite_math_only : BoolFOption<"finite-math-only",
- LangOpts<"FiniteMathOnly">, DefaultFalse,
+defm finite_math_only : BoolOptionWithoutMarshalling<"f", "finite-math-only",
PosFlag<SetTrue, [], [ClangOption, CC1Option],
"Allow floating-point optimizations that "
"assume arguments and results are not NaNs or +-inf. This defines "
"the \\_\\_FINITE\\_MATH\\_ONLY\\_\\_ preprocessor macro.",
- [cl_finite_math_only.KeyPath, ffast_math.KeyPath]>,
+ [ffast_math.KeyPath]>,
NegFlag<SetFalse>>;
defm signed_zeros : BoolFOption<"signed-zeros",
LangOpts<"NoSignedZero">, DefaultFalse,
@@ -7815,10 +7813,10 @@ def mreassociate : Flag<["-"], "mreassociate">,
MarshallingInfoFlag<LangOpts<"AllowFPReassoc">>, ImpliedByAnyOf<[funsafe_math_optimizations.KeyPath]>;
def menable_no_nans : Flag<["-"], "menable-no-nans">,
HelpText<"Allow optimization to assume there are no NaNs.">,
- MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
-def menable_no_infinities : Flag<["-"], "menable-no-infs">,
+ MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
+def menable_no_infs : Flag<["-"], "menable-no-infs">,
HelpText<"Allow optimization to assume there are no infinities.">,
- MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
+ MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
def pic_level : Separate<["-"], "pic-level">,
HelpText<"Value for __PIC__">,
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index ec5dbd2..38b55a0 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1045,18 +1045,6 @@ def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
def MmapWriteExecChecker : Checker<"MmapWriteExec">,
HelpText<"Warn on mmap() calls that are both writable and executable">,
- CheckerOptions<[
- CmdLineOption<Integer,
- "MmapProtExec",
- "Specifies the value of PROT_EXEC",
- "0x04",
- Released>,
- CmdLineOption<Integer,
- "MmapProtRead",
- "Specifies the value of PROT_READ",
- "0x01",
- Released>
- ]>,
Documentation<HasDocumentation>;
def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index bc5a920..a1f7054 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -879,8 +879,6 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
return IDNS_Ordinary;
case Label:
return IDNS_Label;
- case IndirectField:
- return IDNS_Ordinary | IDNS_Member;
case Binding:
case NonTypeTemplateParm:
@@ -918,6 +916,7 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
return IDNS_ObjCProtocol;
case Field:
+ case IndirectField:
case ObjCAtDefsField:
case ObjCIvar:
return IDNS_Member;
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index 867284e..5e3a5b9 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -278,10 +278,15 @@ LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
OS << "\n";
OS.indent(Spaces) << "This: " << getThis() << "\n";
OS.indent(Spaces) << "RVO: " << getRVOPtr() << "\n";
-
- while (const InterpFrame *F = this->Caller) {
+ OS.indent(Spaces) << "Depth: " << Depth << "\n";
+ OS.indent(Spaces) << "ArgSize: " << ArgSize << "\n";
+ OS.indent(Spaces) << "Args: " << (void *)Args << "\n";
+ OS.indent(Spaces) << "FrameOffset: " << FrameOffset << "\n";
+ OS.indent(Spaces) << "FrameSize: " << (Func ? Func->getFrameSize() : 0)
+ << "\n";
+
+ for (const InterpFrame *F = this->Caller; F; F = F->Caller) {
F->dump(OS, Indent + 1);
- F = F->Caller;
}
}
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c59bbc8..d7538c7 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -1170,27 +1170,30 @@ static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
Stk.clear();
}
- const APValue &LV = Res.toAPValue();
- if (!Res.isInvalid() && LV.isLValue()) {
- APValue::LValueBase Base = LV.getLValueBase();
- if (Base.isNull()) {
- // A null base is acceptable.
- return returnInt(true);
- } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
- if (!isa<StringLiteral>(E))
+ if (!Res.isInvalid() && !Res.empty()) {
+ const APValue &LV = Res.toAPValue();
+ if (LV.isLValue()) {
+ APValue::LValueBase Base = LV.getLValueBase();
+ if (Base.isNull()) {
+ // A null base is acceptable.
+ return returnInt(true);
+ } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
+ if (!isa<StringLiteral>(E))
+ return returnInt(false);
+ return returnInt(LV.getLValueOffset().isZero());
+ } else if (Base.is<TypeInfoLValue>()) {
+ // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
+ // evaluate to true.
+ return returnInt(true);
+ } else {
+ // Any other base is not constant enough for GCC.
return returnInt(false);
- return returnInt(LV.getLValueOffset().isZero());
- } else if (Base.is<TypeInfoLValue>()) {
- // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
- // evaluate to true.
- return returnInt(true);
- } else {
- // Any other base is not constant enough for GCC.
- return returnInt(false);
+ }
}
}
- return returnInt(!Res.isInvalid() && !Res.empty());
+ // Otherwise, any constant value is good enough.
+ return returnInt(true);
}
return returnInt(false);
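Illustrative only (not part of the patch): the semantics the restructured builtin preserves, assuming <typeinfo> is included for the typeid case:

    static_assert(__builtin_constant_p(42), "");            // constant values: true
    static_assert(__builtin_constant_p("foo"), "");         // string literal at offset zero: true
    static_assert(__builtin_constant_p(&typeid(int)), "");  // GCC-compatible: true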
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 8d7fe18..e1f68e4 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -416,7 +416,7 @@ public:
// below them can initialize the same object (or part of it).
if (isa<CXXConstructExpr>(E) || isa<CallExpr>(E) || isa<LambdaExpr>(E) ||
isa<CXXDefaultArgExpr>(E) || isa<CXXStdInitializerListExpr>(E) ||
- isa<AtomicExpr>(E) ||
+ isa<AtomicExpr>(E) || isa<CXXInheritedCtorInitExpr>(E) ||
// We treat `BuiltinBitCastExpr` as an "original initializer" too as
// it may not even be casting from a record type -- and even if it is,
// the two objects are in general of unrelated type.
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index e5adc03..9331a63 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -34,8 +34,8 @@ void LangOptions::resetNonModularOptions() {
// invocations that cannot be round-tripped to arguments.
// FIXME: we should derive this automatically from ImpliedBy in tablegen.
AllowFPReassoc = UnsafeFPMath;
- NoHonorNaNs = FiniteMathOnly;
- NoHonorInfs = FiniteMathOnly;
+ NoHonorInfs = FastMath;
+ NoHonorNaNs = FastMath;
// These options do not affect AST generation.
NoSanitizeFiles.clear();
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index 899aefa..b56e2c7 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -173,10 +173,10 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
// "Under /fp:precise and /fp:strict, the compiler doesn't do any mathematical
// transformation unless the transformation is guaranteed to produce a bitwise
// identical result."
- const bool any_imprecise_flags =
- Opts.FastMath || Opts.FiniteMathOnly || Opts.UnsafeFPMath ||
- Opts.AllowFPReassoc || Opts.NoHonorNaNs || Opts.NoHonorInfs ||
- Opts.NoSignedZero || Opts.AllowRecip || Opts.ApproxFunc;
+ const bool any_imprecise_flags = Opts.FastMath || Opts.UnsafeFPMath ||
+ Opts.AllowFPReassoc || Opts.NoHonorNaNs ||
+ Opts.NoHonorInfs || Opts.NoSignedZero ||
+ Opts.AllowRecip || Opts.ApproxFunc;
// "Under both /fp:precise and /fp:fast, the compiler generates code intended
// to run in the default floating-point environment."
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index fbf942d0..fbe9569 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -727,7 +727,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
llvm::AtomicRMWInst *RMWI =
- CGF.Builder.CreateAtomicRMW(Op, Ptr, LoadVal1, Order, Scope);
+ CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
RMWI->setVolatile(E->isVolatile());
// For __atomic_*_fetch operations, perform the operation again to
@@ -2034,6 +2034,17 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
IsWeak);
}
+llvm::AtomicRMWInst *
+CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
+ llvm::Value *Val, llvm::AtomicOrdering Order,
+ llvm::SyncScope::ID SSID) {
+
+ llvm::AtomicRMWInst *RMW =
+ Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
+ getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+ return RMW;
+}
+
void CodeGenFunction::EmitAtomicUpdate(
LValue LVal, llvm::AtomicOrdering AO,
const llvm::function_ref<RValue(RValue)> &UpdateOp, bool IsVolatile) {
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index a17d684..84392745 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2841,9 +2841,10 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub;
llvm::Value *amt = llvm::ConstantFP::get(
VMContext, llvm::APFloat(static_cast<float>(1.0)));
- llvm::Value *old =
- Builder.CreateAtomicRMW(aop, LV.getAddress(), amt,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ llvm::AtomicRMWInst *old =
+ CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt,
+ llvm::AtomicOrdering::SequentiallyConsistent);
+
return isPre ? Builder.CreateBinOp(op, old, amt) : old;
}
value = EmitLoadOfLValue(LV, E->getExprLoc());
@@ -3583,9 +3584,9 @@ LValue ScalarExprEmitter::EmitCompoundAssignLValue(
EmitScalarConversion(OpInfo.RHS, E->getRHS()->getType(), LHSTy,
E->getExprLoc()),
LHSTy);
- Value *OldVal = Builder.CreateAtomicRMW(
- AtomicOp, LHSLV.getAddress(), Amt,
- llvm::AtomicOrdering::SequentiallyConsistent);
+
+ llvm::AtomicRMWInst *OldVal =
+ CGF.emitAtomicRMWInst(AtomicOp, LHSLV.getAddress(), Amt);
// Since operation is atomic, the result type is guaranteed to be the
// same as the input in LLVM terms.
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 853046b..4ee9840 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6326,8 +6326,8 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, IC,
X.getAddress().getElementType());
}
- llvm::Value *Res =
- CGF.Builder.CreateAtomicRMW(RMWOp, X.getAddress(), UpdateVal, AO);
+ llvm::AtomicRMWInst *Res =
+ CGF.emitAtomicRMWInst(RMWOp, X.getAddress(), UpdateVal, AO);
return std::make_pair(true, RValue::get(Res));
}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 67e30195..bd62c65 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4160,6 +4160,13 @@ public:
llvm::AtomicOrdering::SequentiallyConsistent,
bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored());
+ /// Emit an atomicrmw instruction, applying relevant metadata when
+ /// applicable.
+ llvm::AtomicRMWInst *emitAtomicRMWInst(
+ llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val,
+ llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent,
+ llvm::SyncScope::ID SSID = llvm::SyncScope::System);
+
void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
const llvm::function_ref<RValue(RValue)> &UpdateOp,
bool IsVolatile);
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 2f21385..8f17c05 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -334,6 +334,10 @@ public:
llvm::AtomicOrdering Ordering,
llvm::LLVMContext &Ctx) const;
+ /// Allow the target to apply other metadata to an atomic instruction
+ virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,
+ llvm::AtomicRMWInst &RMW) const {}
+
/// Interface class for filling custom fields of a block literal for OpenCL.
class TargetOpenCLBlockHelper {
public:
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4d3275e..37e6af3 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -311,6 +311,8 @@ public:
SyncScope Scope,
llvm::AtomicOrdering Ordering,
llvm::LLVMContext &Ctx) const override;
+ void setTargetAtomicMetadata(CodeGenFunction &CGF,
+ llvm::AtomicRMWInst &RMW) const override;
llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
llvm::Function *BlockInvokeFunc,
llvm::Type *BlockTy) const override;
@@ -546,6 +548,23 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
return Ctx.getOrInsertSyncScopeID(Name);
}
+void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
+ CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const {
+ if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
+ return;
+
+ // TODO: Introduce new, more controlled options that also work for integers,
+ // and deprecate allowAMDGPUUnsafeFPAtomics.
+ llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation();
+ if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
+ llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
+ RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty);
+
+ if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy())
+ RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty);
+ }
+}
+
bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
return false;
}
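For illustration (not part of the patch): when unsafe FP atomics are allowed (e.g. via -munsafe-fp-atomics), a floating-point atomic add such as

    __atomic_fetch_add(ptr, 1.0f, __ATOMIC_SEQ_CST);

is now emitted as an atomicrmw fadd tagged with !amdgpu.no.fine.grained.memory metadata, plus !amdgpu.ignore.denormal.mode when the operand type is a 32-bit float.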
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index f6c7300..f796c31 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -620,8 +620,10 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const char *LinkingOutput) const {
std::string Linker = getToolChain().GetLinkerPath();
ArgStringList CmdArgs;
- CmdArgs.push_back("--no-undefined");
- CmdArgs.push_back("-shared");
+ if (!Args.hasArg(options::OPT_r)) {
+ CmdArgs.push_back("--no-undefined");
+ CmdArgs.push_back("-shared");
+ }
addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs);
Args.AddAllArgs(CmdArgs, options::OPT_L);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4a94df7..6790079 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3384,9 +3384,28 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
CmdArgs.push_back("-ffast-math");
// Handle __FINITE_MATH_ONLY__ similarly.
- if (!HonorINFs && !HonorNaNs)
+ // -ffinite-math-only is added to CmdArgs when !HonorINFs && !HonorNaNs.
+ // Otherwise process the Xclang arguments to determine if -menable-no-infs and
+ // -menable-no-nans are set by the user.
+ bool shouldAddFiniteMathOnly = false;
+ if (!HonorINFs && !HonorNaNs) {
+ shouldAddFiniteMathOnly = true;
+ } else {
+ bool InfValues = true;
+ bool NanValues = true;
+ for (const auto *Arg : Args.filtered(options::OPT_Xclang)) {
+ StringRef ArgValue = Arg->getValue();
+ if (ArgValue == "-menable-no-nans")
+ NanValues = false;
+ else if (ArgValue == "-menable-no-infs")
+ InfValues = false;
+ }
+ if (!NanValues && !InfValues)
+ shouldAddFiniteMathOnly = true;
+ }
+ if (shouldAddFiniteMathOnly) {
CmdArgs.push_back("-ffinite-math-only");
-
+ }
if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) {
CmdArgs.push_back("-mfpmath");
CmdArgs.push_back(A->getValue());
@@ -3755,6 +3774,11 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs,
CmdArgs.push_back(Args.MakeArgString(CLExtStr));
}
+ if (Args.hasArg(options::OPT_cl_finite_math_only)) {
+ CmdArgs.push_back("-menable-no-infs");
+ CmdArgs.push_back("-menable-no-nans");
+ }
+
for (const auto &Arg : ForwardedArguments)
if (const auto *A = Args.getLastArg(Arg))
CmdArgs.push_back(Args.MakeArgString(A->getOption().getPrefixedName()));
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 813a0fb..f883f29 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -266,6 +266,10 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
if (UseJMC)
AddLTOFlag("-enable-jmc-instrument");
+ if (Args.hasFlag(options::OPT_fstack_size_section,
+ options::OPT_fno_stack_size_section, false))
+ AddLTOFlag("-stack-size-section");
+
if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 920ddf7..17b9ca7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1318,7 +1318,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
if (!LangOpts.MathErrno)
Builder.defineMacro("__NO_MATH_ERRNO__");
- if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
+ if (LangOpts.FastMath || (LangOpts.NoHonorInfs && LangOpts.NoHonorNaNs))
Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
else
Builder.defineMacro("__FINITE_MATH_ONLY__", "0");
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 0592423..135dca0 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -980,7 +980,6 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
*Callbacks->OS << static_cast<unsigned>(*Iter);
PrintComma = true;
}
- IsStartOfLine = true;
} else if (Tok.isAnnotation()) {
// Ignore annotation tokens created by pragmas - the pragmas themselves
// will be reproduced in the preprocessed output.
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 879f01e..1e31fcc 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -1604,6 +1604,16 @@ static bool isTargetVariantEnvironment(const TargetInfo &TI,
return false;
}
+#if defined(__sun__) && defined(__svr4__)
+// GCC mangles std::tm as tm for binary compatibility on Solaris (Issue
+// #33114). We need to match this to allow the std::put_time calls to link
+// (PR #99075).
+asm("_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPKSt2tmPKcSB_ = "
+ "_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPK2tmPKcSB_");
+#endif
+
/// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
/// as a builtin macro, handle it and return the next token as 'Tok'.
void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 07a34fd..bb30b1e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1477,6 +1477,18 @@ static bool BuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall,
return false;
}
+// In OpenCL, __builtin_alloca_* should return a pointer to an address space
+// that corresponds to the stack address space, i.e. the private address space.
+static void builtinAllocaAddrSpace(Sema &S, CallExpr *TheCall) {
+ QualType RT = TheCall->getType();
+ assert((RT->isPointerType() && !(RT->getPointeeType().hasAddressSpace())) &&
+ "__builtin_alloca has invalid address space");
+
+ RT = RT->getPointeeType();
+ RT = S.Context.getAddrSpaceQualType(RT, LangAS::opencl_private);
+ TheCall->setType(S.Context.getPointerType(RT));
+}
+
namespace {
enum PointerAuthOpKind {
PAO_Strip,
@@ -2214,6 +2226,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
case Builtin::BI__builtin_alloca_uninitialized:
Diag(TheCall->getBeginLoc(), diag::warn_alloca)
<< TheCall->getDirectCallee();
+ if (getLangOpts().OpenCL) {
+ builtinAllocaAddrSpace(*this, TheCall);
+ }
break;
case Builtin::BI__arithmetic_fence:
if (BuiltinArithmeticFence(TheCall))
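Illustrative OpenCL usage (not part of the patch): after this change a kernel can write

    __private int *p = (__private int *)__builtin_alloca(sizeof(int));

because the builtin's result type now carries the private (stack) address space rather than no address space.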
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 575bd29..7bdecb2 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11009,6 +11009,9 @@ static bool AttrCompatibleWithMultiVersion(attr::Kind Kind,
switch (Kind) {
default:
return false;
+ case attr::ArmLocallyStreaming:
+ return MVKind == MultiVersionKind::TargetVersion ||
+ MVKind == MultiVersionKind::TargetClones;
case attr::Used:
return MVKind == MultiVersionKind::Target;
case attr::NonNull:
@@ -11145,7 +11148,21 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
FunctionType::ExtInfo OldTypeInfo = OldType->getExtInfo();
FunctionType::ExtInfo NewTypeInfo = NewType->getExtInfo();
- if (OldTypeInfo.getCC() != NewTypeInfo.getCC())
+ const auto *OldFPT = OldFD->getType()->getAs<FunctionProtoType>();
+ const auto *NewFPT = NewFD->getType()->getAs<FunctionProtoType>();
+
+ bool ArmStreamingCCMismatched = false;
+ if (OldFPT && NewFPT) {
+ unsigned Diff =
+ OldFPT->getAArch64SMEAttributes() ^ NewFPT->getAArch64SMEAttributes();
+ // Arm-streaming, arm-streaming-compatible and non-streaming versions
+ // cannot be mixed.
+ if (Diff & (FunctionType::SME_PStateSMEnabledMask |
+ FunctionType::SME_PStateSMCompatibleMask))
+ ArmStreamingCCMismatched = true;
+ }
+
+ if (OldTypeInfo.getCC() != NewTypeInfo.getCC() || ArmStreamingCCMismatched)
return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << CallingConv;
QualType OldReturnType = OldType->getReturnType();
@@ -11165,9 +11182,8 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
if (!CLinkageMayDiffer && OldFD->isExternC() != NewFD->isExternC())
return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << LanguageLinkage;
- if (CheckEquivalentExceptionSpec(
- OldFD->getType()->getAs<FunctionProtoType>(), OldFD->getLocation(),
- NewFD->getType()->getAs<FunctionProtoType>(), NewFD->getLocation()))
+ if (CheckEquivalentExceptionSpec(OldFPT, OldFD->getLocation(), NewFPT,
+ NewFD->getLocation()))
return true;
}
return false;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 300bd89..8b0a6bf 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3034,9 +3034,6 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
<< Unsupported << None << CurFeature << TargetVersion;
}
- if (IsArmStreamingFunction(cast<FunctionDecl>(D),
- /*IncludeLocallyStreaming=*/false))
- return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
return false;
}
@@ -3133,10 +3130,6 @@ bool Sema::checkTargetClonesAttrString(
HasNotDefault = true;
}
}
- if (IsArmStreamingFunction(cast<FunctionDecl>(D),
- /*IncludeLocallyStreaming=*/false))
- return Diag(LiteralLoc,
- diag::err_sme_streaming_cannot_be_multiversioned);
} else {
// Other targets ( currently X86 )
if (Cur.starts_with("arch=")) {
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 1cca8ac..5782daa 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -10385,7 +10385,7 @@ void Sema::checkIncorrectVTablePointerAuthenticationAttribute(
while (1) {
assert(PrimaryBase);
const CXXRecordDecl *Base = nullptr;
- for (auto BasePtr : PrimaryBase->bases()) {
+ for (const CXXBaseSpecifier &BasePtr : PrimaryBase->bases()) {
if (!BasePtr.getType()->getAsCXXRecordDecl()->isDynamicClass())
continue;
Base = BasePtr.getType()->getAsCXXRecordDecl();
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 51c4a36..029969c 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -6030,6 +6030,32 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI
return cast<CXXRecordDecl>(rhsRecord->getDecl())
->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getDecl()));
}
+ case BTT_IsVirtualBaseOf: {
+ const RecordType *BaseRecord = LhsT->getAs<RecordType>();
+ const RecordType *DerivedRecord = RhsT->getAs<RecordType>();
+
+ if (!BaseRecord || !DerivedRecord) {
+ DiagnoseVLAInCXXTypeTrait(Self, Lhs,
+ tok::kw___builtin_is_virtual_base_of);
+ DiagnoseVLAInCXXTypeTrait(Self, Rhs,
+ tok::kw___builtin_is_virtual_base_of);
+ return false;
+ }
+
+ if (BaseRecord->isUnionType() || DerivedRecord->isUnionType())
+ return false;
+
+ if (!BaseRecord->isStructureOrClassType() ||
+ !DerivedRecord->isStructureOrClassType())
+ return false;
+
+ if (Self.RequireCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT,
+ diag::err_incomplete_type))
+ return false;
+
+ return cast<CXXRecordDecl>(DerivedRecord->getDecl())
+ ->isVirtuallyDerivedFrom(cast<CXXRecordDecl>(BaseRecord->getDecl()));
+ }
case BTT_IsSame:
return Self.Context.hasSameType(LhsT, RhsT);
case BTT_TypeCompatible: {
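From the semantics implemented above: both operands must be structs or classes (unions and non-class types yield false), only the derived operand must be complete, and the trait reports whether the base is inherited virtually anywhere in the hierarchy. A short C++ sketch (type names are illustrative):

  struct A {};
  struct B : virtual A {};
  struct C : B {};   // A remains a virtual base, transitively
  struct D : A {};   // plain, non-virtual inheritance

  static_assert(__builtin_is_virtual_base_of(A, B), "");
  static_assert(__builtin_is_virtual_base_of(A, C), "");
  static_assert(!__builtin_is_virtual_base_of(A, D), "");
  static_assert(!__builtin_is_virtual_base_of(A, A), "");  // not derived from itself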
diff --git a/clang/lib/Sema/SemaPPC.cpp b/clang/lib/Sema/SemaPPC.cpp
index 99f46b1..5b764ed 100644
--- a/clang/lib/Sema/SemaPPC.cpp
+++ b/clang/lib/Sema/SemaPPC.cpp
@@ -93,7 +93,6 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
ASTContext &Context = getASTContext();
- unsigned i = 0, l = 0, u = 0;
bool IsTarget64Bit = TI.getTypeWidth(TI.getIntPtrType()) == 64;
llvm::APSInt Result;
@@ -248,7 +247,7 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
return BuiltinPPCMMACall(TheCall, BuiltinID, Types);
#include "clang/Basic/BuiltinsPPC.def"
}
- return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u);
+ llvm_unreachable("must return from switch");
}
// Check if the given type is a non-pointer PPC MMA type. This function is used
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index b7b857e..db7f233 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -951,9 +951,11 @@ private:
// Skip over the pack elements that were expanded into separate arguments.
// If we partially expanded, this is the number of partial arguments.
+ // FIXME: `&& FixedNumExpansions` is a workaround for UB described in
+ // https://github.com/llvm/llvm-project/issues/100095
if (IsPartiallyExpanded)
PackElements += NumPartialPackArgs;
- else if (IsExpanded)
+ else if (IsExpanded && FixedNumExpansions)
PackElements += *FixedNumExpansions;
for (auto &Pack : Packs) {
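The guard is needed because FixedNumExpansions is a std::optional, and dereferencing a disengaged optional is undefined behavior. A standalone illustration of the pattern (not part of the patch):

  #include <optional>

  unsigned numExpansions(std::optional<unsigned> Fixed) {
    // Writing `return *Fixed;` would be UB when `Fixed` is disengaged;
    // testing engagement first mirrors the workaround above.
    return Fixed ? *Fixed : 0;
  }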
diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
index cd1dd1b2..4b8e521 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
@@ -21,49 +21,56 @@
#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
using namespace clang;
using namespace ento;
namespace {
-class MmapWriteExecChecker : public Checker<check::PreCall> {
+class MmapWriteExecChecker
+ : public Checker<check::ASTDecl<TranslationUnitDecl>, check::PreCall> {
CallDescription MmapFn{CDM::CLibrary, {"mmap"}, 6};
CallDescription MprotectFn{CDM::CLibrary, {"mprotect"}, 3};
- static int ProtWrite;
- static int ProtExec;
- static int ProtRead;
const BugType BT{this, "W^X check fails, Write Exec prot flags set",
"Security"};
+  // Default values are used if the definitions of the flags are not found.
+ mutable int ProtRead = 0x01;
+ mutable int ProtWrite = 0x02;
+ mutable int ProtExec = 0x04;
+
public:
+ void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &Mgr,
+ BugReporter &BR) const;
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
- int ProtExecOv;
- int ProtReadOv;
};
}
-int MmapWriteExecChecker::ProtWrite = 0x02;
-int MmapWriteExecChecker::ProtExec = 0x04;
-int MmapWriteExecChecker::ProtRead = 0x01;
+void MmapWriteExecChecker::checkASTDecl(const TranslationUnitDecl *TU,
+ AnalysisManager &Mgr,
+ BugReporter &BR) const {
+ Preprocessor &PP = Mgr.getPreprocessor();
+ const std::optional<int> FoundProtRead = tryExpandAsInteger("PROT_READ", PP);
+ const std::optional<int> FoundProtWrite =
+ tryExpandAsInteger("PROT_WRITE", PP);
+ const std::optional<int> FoundProtExec = tryExpandAsInteger("PROT_EXEC", PP);
+ if (FoundProtRead && FoundProtWrite && FoundProtExec) {
+ ProtRead = *FoundProtRead;
+ ProtWrite = *FoundProtWrite;
+ ProtExec = *FoundProtExec;
+ }
+}
void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
- CheckerContext &C) const {
+ CheckerContext &C) const {
if (matchesAny(Call, MmapFn, MprotectFn)) {
SVal ProtVal = Call.getArgSVal(2);
auto ProtLoc = ProtVal.getAs<nonloc::ConcreteInt>();
if (!ProtLoc)
return;
int64_t Prot = ProtLoc->getValue().getSExtValue();
- if (ProtExecOv != ProtExec)
- ProtExec = ProtExecOv;
- if (ProtReadOv != ProtRead)
- ProtRead = ProtReadOv;
-
- // Wrong settings
- if (ProtRead == ProtExec)
- return;
- if ((Prot & (ProtWrite | ProtExec)) == (ProtWrite | ProtExec)) {
+ if ((Prot & ProtWrite) && (Prot & ProtExec)) {
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N)
return;
@@ -80,17 +87,10 @@ void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
}
}
-void ento::registerMmapWriteExecChecker(CheckerManager &mgr) {
- MmapWriteExecChecker *Mwec =
- mgr.registerChecker<MmapWriteExecChecker>();
- Mwec->ProtExecOv =
- mgr.getAnalyzerOptions()
- .getCheckerIntegerOption(Mwec, "MmapProtExec");
- Mwec->ProtReadOv =
- mgr.getAnalyzerOptions()
- .getCheckerIntegerOption(Mwec, "MmapProtRead");
+void ento::registerMmapWriteExecChecker(CheckerManager &Mgr) {
+ Mgr.registerChecker<MmapWriteExecChecker>();
}
-bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &) {
return true;
}
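Summary of the rewrite: the checker now reads PROT_READ/PROT_WRITE/PROT_EXEC from the translation unit's own macro definitions (falling back to the common 0x1/0x2/0x4 values) instead of taking user-configured overrides, and it warns whenever the write and exec bits are both set. A sketch of code it flags, assuming the usual POSIX constants:

  #include <sys/mman.h>

  void *make_wx(size_t len) {
    // PROT_WRITE and PROT_EXEC are both set, so the W^X warning fires
    // whether or not PROT_READ is also requested.
    return mmap(0, len, PROT_WRITE | PROT_EXEC,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  }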
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index a74b68b..9b2b207 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -31,3 +31,8 @@ constexpr bool assume() {
return true;
}
static_assert(assume(), "");
+
+void test_builtin_os_log(void *buf, int i, const char *data) {
+ constexpr int len = __builtin_os_log_format_buffer_size("%d %{public}s %{private}.16P", i, data, data);
+ static_assert(len > 0, "Expect len > 0");
+}
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 2a4c400..b8dbcdd 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -9,8 +9,6 @@
// CHECK-NEXT: alpha.clone.CloneChecker:ReportNormalClones = true
// CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false
// CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtExec = 0x04
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtRead = 0x01
// CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
// CHECK-NEXT: apply-fixits = false
// CHECK-NEXT: assume-controlled-environment = false
diff --git a/clang/test/Analysis/mmap-writeexec.c b/clang/test/Analysis/mmap-writeexec.c
index 8fd86ce..579cc75 100644
--- a/clang/test/Analysis/mmap-writeexec.c
+++ b/clang/test/Analysis/mmap-writeexec.c
@@ -1,13 +1,14 @@
-// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -analyzer-config alpha.security.MmapWriteExec:MmapProtExec=1 -analyzer-config alpha.security.MmapWriteExec:MmapProtRead=4 -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
+// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
// RUN: %clang_analyze_cc1 -triple x86_64-unknown-apple-darwin10 -analyzer-checker=alpha.security.MmapWriteExec -verify %s
-#define PROT_WRITE 0x02
#ifndef USE_ALTERNATIVE_PROT_EXEC_DEFINITION
-#define PROT_EXEC 0x04
-#define PROT_READ 0x01
-#else
#define PROT_EXEC 0x01
+#define PROT_WRITE 0x02
#define PROT_READ 0x04
+#else
+#define PROT_EXEC 0x08
+#define PROT_WRITE 0x04
+#define PROT_READ 0x02
#endif
#define MAP_PRIVATE 0x0002
#define MAP_ANON 0x1000
diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
new file mode 100644
index 0000000..6deff11
--- /dev/null
+++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,SAFE %s
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -munsafe-fp-atomics -o - %s | FileCheck -check-prefixes=CHECK,UNSAFE %s
+
+// SAFE-LABEL: define dso_local float @test_float_post_inc(
+// SAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: ret float [[TMP0]]
+//
+float test_float_post_inc()
+{
+ static _Atomic float n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: ret float [[TMP0]]
+//
+float test_float_post_dc()
+{
+ static _Atomic float n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: ret float [[TMP1]]
+//
+float test_float_pre_dc()
+{
+ static _Atomic float n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: ret float [[TMP1]]
+//
+float test_float_pre_inc()
+{
+ static _Atomic float n;
+ return ++n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP1]]
+//
+double test_double_post_inc()
+{
+ static _Atomic double n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP1]]
+//
+double test_double_post_dc()
+{
+ static _Atomic double n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP2]]
+//
+double test_double_pre_dc()
+{
+ static _Atomic double n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP2]]
+//
+double test_double_pre_inc()
+{
+ static _Atomic double n;
+ return ++n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP1]]
+//
+_Float16 test__Float16_post_inc()
+{
+ static _Atomic _Float16 n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP1]]
+//
+_Float16 test__Float16_post_dc()
+{
+ static _Atomic _Float16 n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_dc()
+{
+ static _Atomic _Float16 n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_inc()
+{
+ static _Atomic _Float16 n;
+ return ++n;
+}
+//.
+// UNSAFE: [[META3]] = !{}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGen/aarch64-fmv-streaming.c b/clang/test/CodeGen/aarch64-fmv-streaming.c
new file mode 100644
index 0000000..e549ccd
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-streaming.c
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -emit-llvm -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msve
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msimd
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msme2
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+//
+__attribute__((target_version("sme2"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee.default
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void n_callee(void) {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msve
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msimd
+// CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msme2
+// CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee.default
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void s_callee(void) __arm_streaming {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msve
+// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msimd
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msme2
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee.default
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_compatible {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_caller
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12:[0-9]+]]
+// CHECK: call void @sc_callee() #[[ATTR13:[0-9]+]]
+//
+void n_caller(void) {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_caller
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12]]
+// CHECK: call void @sc_callee() #[[ATTR13]]
+//
+void s_caller(void) __arm_streaming {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_caller
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12]]
+// CHECK: call void @sc_callee() #[[ATTR13]]
+//
+void sc_caller(void) __arm_streaming_compatible {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK: attributes #[[ATTR0:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR1:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR2:[0-9]+]] = {{.*}}
+// CHECK: attributes #[[ATTR3]] = {{.*}}
+// CHECK: attributes #[[ATTR4:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR5:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR6:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR7]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR8:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR9:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR10]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR11]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR12]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR13]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/CodeGen/finite-math.c b/clang/test/CodeGen/finite-math.c
index d1a2956..9cddba99d 100644
--- a/clang/test/CodeGen/finite-math.c
+++ b/clang/test/CodeGen/finite-math.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffinite-math-only -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
+// RUN: %clang_cc1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
// RUN: %clang_cc1 -fno-signed-zeros -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=NSZ
// RUN: %clang_cc1 -freciprocal-math -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=RECIP
// RUN: %clang_cc1 -mreassociate -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=REASSOC
diff --git a/clang/test/CodeGen/fp-floatcontrol-stack.cpp b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
index 090da25..237c9d4 100644
--- a/clang/test/CodeGen/fp-floatcontrol-stack.cpp
+++ b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DDEFAULT=1 -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DDEFAULT %s
// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DEBSTRICT=1 -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DEBSTRICT %s
// RUN: %clang_cc1 -triple x86_64-linux-gnu -DFAST=1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -ffinite-math-only -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
#define FUN(n) \
(float z) { return n * z + n; }
diff --git a/clang/test/CodeGen/fp-options-to-fast-math-flags.c b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
index abdcf85..6aa6226 100644
--- a/clang/test/CodeGen/fp-options-to-fast-math-flags.c
+++ b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck -check-prefix CHECK-PRECISE %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-NANS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-INFS %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fno-signed-zeros -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-SIGNED-ZEROS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -mreassociate -emit-llvm -o - %s | FileCheck -check-prefix CHECK-REASSOC %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -freciprocal-math -emit-llvm -o - %s | FileCheck -check-prefix CHECK-RECIP %s
diff --git a/clang/test/CodeGen/nofpclass.c b/clang/test/CodeGen/nofpclass.c
index fc4c64f..2347091 100644
--- a/clang/test/CodeGen/nofpclass.c
+++ b/clang/test/CodeGen/nofpclass.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --version 2
// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
-// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -cl-finite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
+// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=NONANS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=NOINFS %s
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index eeb23bc..55ddb52 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -1,6 +1,10 @@
// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
-// RUN: -fnative-half-arguments-and-returns | FileCheck %s
+// RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefixes=CHECK,SAFEIR %s
+
+// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
+// RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics | FileCheck -check-prefixes=CHECK,UNSAFEIR %s
// RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -target-cpu gfx1100 -fnative-half-type \
@@ -18,24 +22,38 @@
__global__ void ffp1(float *p) {
// CHECK-LABEL: @_Z4ffp1Pf
- // CHECK: atomicrmw fadd ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE: _Z4ffp1Pf
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
+ // SAFE: global_atomic_cmpswap
+
// UNSAFE: _Z4ffp1Pf
// UNSAFE: global_atomic_add_f32
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
+ // UNSAFE: global_atomic_cmpswap
+
__atomic_fetch_add(p, 1.0f, memory_order_relaxed);
+ __atomic_fetch_sub(p, 1.0f, memory_order_relaxed);
__atomic_fetch_max(p, 1.0f, memory_order_relaxed);
__atomic_fetch_min(p, 1.0f, memory_order_relaxed);
__hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
@@ -44,23 +62,36 @@ __global__ void ffp1(float *p) {
__global__ void ffp2(double *p) {
// CHECK-LABEL: @_Z4ffp2Pd
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE-LABEL: @_Z4ffp2Pd
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
+ // SAFE: global_atomic_cmpswap_b64
+
// UNSAFE-LABEL: @_Z4ffp2Pd
+ // UNSAFE: global_atomic_add_f64
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_max_f64
// UNSAFE: global_atomic_min_f64
+ __atomic_fetch_add(p, 1.0, memory_order_relaxed);
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
__atomic_fetch_max(p, 1.0, memory_order_relaxed);
__atomic_fetch_min(p, 1.0, memory_order_relaxed);
@@ -71,11 +102,20 @@ __global__ void ffp2(double *p) {
// long double is the same as double for amdgcn.
__global__ void ffp3(long double *p) {
// CHECK-LABEL: @_Z4ffp3Pe
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE-LABEL: @_Z4ffp3Pe
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
@@ -88,6 +128,7 @@ __global__ void ffp3(long double *p) {
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_max_f64
// UNSAFE: global_atomic_min_f64
+ __atomic_fetch_add(p, 1.0L, memory_order_relaxed);
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
__atomic_fetch_max(p, 1.0L, memory_order_relaxed);
__atomic_fetch_min(p, 1.0L, memory_order_relaxed);
@@ -98,37 +139,52 @@ __global__ void ffp3(long double *p) {
__device__ double ffp4(double *p, float f) {
// CHECK-LABEL: @_Z4ffp4Pdf
// CHECK: fpext float {{.*}} to double
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
return __atomic_fetch_sub(p, f, memory_order_relaxed);
}
__device__ double ffp5(double *p, int i) {
// CHECK-LABEL: @_Z4ffp5Pdi
// CHECK: sitofp i32 {{.*}} to double
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
return __atomic_fetch_sub(p, i, memory_order_relaxed);
}
__global__ void ffp6(_Float16 *p) {
// CHECK-LABEL: @_Z4ffp6PDF16
- // CHECK: atomicrmw fadd ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE: _Z4ffp6PDF16
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
+ // SAFE: global_atomic_cmpswap
+
// UNSAFE: _Z4ffp6PDF16
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
+ // UNSAFE: global_atomic_cmpswap
__atomic_fetch_add(p, 1.0, memory_order_relaxed);
+ __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
__atomic_fetch_max(p, 1.0, memory_order_relaxed);
__atomic_fetch_min(p, 1.0, memory_order_relaxed);
__hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
new file mode 100644
index 0000000..474e95e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -0,0 +1,141 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -cl-ext=+__opencl_c_generic_address_space \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca(unsigned n) {
+ __private float* alloc_ptr = (__private float*)__builtin_alloca(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_uninitialized(unsigned n) {
+ __private float* alloc_ptr_uninitialized = (__private float*)__builtin_alloca_uninitialized(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_with_align(unsigned n) {
+ __private float* alloc_ptr_align = (__private float*)__builtin_alloca_with_align((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
+ __private float* alloc_ptr_align_uninitialized = (__private float*)__builtin_alloca_with_align_uninitialized((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca(unsigned n) {
+ __private void *alloc_ptr = __builtin_alloca(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_uninitialized(unsigned n) {
+ __private void *alloc_ptr_uninitialized = __builtin_alloca_uninitialized(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_with_align(unsigned n) {
+ __private void *alloc_ptr_align = __builtin_alloca_with_align(n, 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_with_align_uninitialized(unsigned n) {
+ __private void *alloc_ptr_align_uninitialized = __builtin_alloca_with_align_uninitialized(n, 8);
+}
diff --git a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
index 2751caa..a5f0019 100644
--- a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
+++ b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
// RUN: %clang_cc1 %s -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
// RUN: %clang_cc1 %s -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
// RUN: %clang_cc1 %s -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
// RUN: %clang_cc1 %s -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
@@ -9,7 +9,7 @@
// RUN: %clang_cc1 %s -DGEN_PCH=1 -finclude-default-header -triple spir-unknown-unknown -emit-pch -o %t.pch
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index 8ab6a07..ebd1158 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -28,3 +28,7 @@
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
// RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
// LD: ld.lld
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: -r %s 2>&1 | FileCheck -check-prefixes=RELO %s
+// RELO-NOT: -shared
diff --git a/clang/test/Driver/opencl.cl b/clang/test/Driver/opencl.cl
index aba37fc..3b0b191 100644
--- a/clang/test/Driver/opencl.cl
+++ b/clang/test/Driver/opencl.cl
@@ -35,7 +35,7 @@
// CHECK-OPT-DISABLE: "-cc1" {{.*}} "-cl-opt-disable"
// CHECK-STRICT-ALIASING: "-cc1" {{.*}} "-cl-strict-aliasing"
// CHECK-SINGLE-PRECISION-CONST: "-cc1" {{.*}} "-cl-single-precision-constant"
-// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-cl-finite-math-only"
+// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-menable-no-infs" "-menable-no-nans" "-cl-finite-math-only"
// CHECK-KERNEL-ARG-INFO: "-cc1" {{.*}} "-cl-kernel-arg-info"
// CHECK-UNSAFE-MATH-OPT: "-cc1" {{.*}} "-cl-unsafe-math-optimizations"
// CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
diff --git a/clang/test/Driver/stack-size-section.c b/clang/test/Driver/stack-size-section.c
index 71b9f85..7cd41e4 100644
--- a/clang/test/Driver/stack-size-section.c
+++ b/clang/test/Driver/stack-size-section.c
@@ -14,6 +14,7 @@
// RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
// RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section -fno-stack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO-NO
+// RUN: %clang -### --target=x86_64-sie-ps5 -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
// LTO: "-plugin-opt=-stack-size-section"
// LTO-NO-NOT: "-plugin-opt=-stack-size-section"
diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip
index cd085fd..ed1030b 100644
--- a/clang/test/Headers/__clang_hip_cmath.hip
+++ b/clang/test/Headers/__clang_hip_cmath.hip
@@ -13,7 +13,8 @@
// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
// RUN: -internal-isystem %S/Inputs/include \
// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN: -menable-no-nans -o - \
// RUN: -D__HIPCC_RTC__ | FileCheck -check-prefix=FINITEONLY %s
// DEFAULT-LABEL: @test_fma_f16(
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 26da828..6ee1097 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -14,7 +14,8 @@
// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
// RUN: -internal-isystem %S/Inputs/include \
// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN: -menable-no-nans -o - \
// RUN: -D__HIPCC_RTC__ | FileCheck -check-prefixes=CHECK,FINITEONLY %s
// Check that we end up with -fapprox-func set on intrinsic calls
diff --git a/clang/test/Headers/float.c b/clang/test/Headers/float.c
index 051c505..a3dd9c9 100644
--- a/clang/test/Headers/float.c
+++ b/clang/test/Headers/float.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -fsyntax-only -verify -std=c99 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 -ffreestanding %s
-// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -ffinite-math-only %s
+// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -menable-no-nans -menable-no-infs %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++11 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++14 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++17 -ffreestanding %s
diff --git a/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
new file mode 100644
index 0000000..7a34113
--- /dev/null
+++ b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=DEFAULT %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -munsafe-fp-atomics -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+
+#pragma omp declare target
+
+float fv, fx;
+double dv, dx;
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4
+// DEFAULT-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT: store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT: ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-FP-ATOMICS-NEXT: [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]], !amdgpu.ignore.denormal.mode [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT: store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT: ret void
+//
+void atomic_fadd_f32() {
+#pragma omp atomic capture
+ fv = fx = fx + fv;
+}
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// DEFAULT-SAME: ) #[[ATTR0]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8
+// DEFAULT-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT: store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT: ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0]] {
+// UNSAFE-FP-ATOMICS-NEXT: [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT: store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT: ret void
+//
+void atomic_fadd_f64() {
+#pragma omp atomic capture
+ dv = dx = dx + dv;
+}
+
+#pragma omp end declare target
+//.
+// UNSAFE-FP-ATOMICS: [[META5]] = !{}
+//.
diff --git a/clang/test/Parser/namelookup-anonymous-struct.c b/clang/test/Parser/namelookup-anonymous-struct.c
new file mode 100644
index 0000000..cb691c2
--- /dev/null
+++ b/clang/test/Parser/namelookup-anonymous-struct.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c11 -verify %s
+
+struct GH31295 {
+ struct { int x; };
+ int arr[sizeof(x)]; // expected-error{{use of undeclared identifier 'x'}}
+};
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index 7f036bf..633ba468 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -70,7 +70,7 @@
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-MATH-ERRNO
// CHECK-NO-MATH-ERRNO: #define __NO_MATH_ERRNO__ 1
//
-// RUN: %clang_cc1 %s -E -dM -ffinite-math-only -o - \
+// RUN: %clang_cc1 %s -E -dM -menable-no-nans -menable-no-infs -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-ONLY
// CHECK-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 1
//
@@ -316,4 +316,4 @@
// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \
// RUN: %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG
// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1
-// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 \ No newline at end of file
+// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
diff --git a/clang/test/Sema/aarch64-fmv-streaming.c b/clang/test/Sema/aarch64-fmv-streaming.c
new file mode 100644
index 0000000..93b7656
--- /dev/null
+++ b/clang/test/Sema/aarch64-fmv-streaming.c
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming(void) __arm_streaming {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming(void) __arm_streaming {}
+__attribute__((target_version("default"))) void ok_arm_streaming(void) __arm_streaming {}
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__attribute__((target_version("default"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void ok_no_streaming(void) {}
+__attribute__((target_version("sme2"))) void ok_no_streaming(void) {}
+__attribute__((target_version("default"))) void ok_no_streaming(void) {}
+
+__attribute__((target_clones("sve", "simd"))) void bad_mixed_streaming(void) {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("sme2"))) void bad_mixed_streaming(void) __arm_streaming {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("default"))) void bad_mixed_streaming(void) __arm_streaming_compatible {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__arm_locally_streaming __attribute__((target_version("dotprod"))) void bad_mixed_streaming(void) __arm_streaming {}
+
+void n_caller(void) {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
+
+void s_caller(void) __arm_streaming {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
+
+void sc_caller(void) __arm_streaming_compatible {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 6db39d6..0c263eb 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -455,48 +455,6 @@ void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")
share_zt0_only();
}
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_version("sme2")))
-void cannot_work_version(void) __arm_streaming {}
-// expected-cpp-error@+5 {{function declared 'void ()' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
-// expected-cpp-note@-2 {{previous declaration is here}}
-// expected-error@+3 {{function declared 'void (void)' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
-// expected-note@-4 {{previous declaration is here}}
-__attribute__((target_version("default")))
-void cannot_work_version(void) {}
-
-
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_clones("sme2")))
-void cannot_work_clones(void) __arm_streaming {}
-
-
-__attribute__((target("sme2")))
-void just_fine_streaming(void) __arm_streaming {}
-__attribute__((target_version("sme2")))
-void just_fine(void) { just_fine_streaming(); }
-__attribute__((target_version("default")))
-void just_fine(void) {}
-
-
-__arm_locally_streaming
-__attribute__((target_version("sme2")))
-void incompatible_locally_streaming(void) {}
-// expected-error@-1 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-// expected-cpp-error@-2 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-__attribute__((target_version("default")))
-void incompatible_locally_streaming(void) {}
-
-
-void fmv_caller() {
- cannot_work_version();
- cannot_work_clones();
- just_fine();
- incompatible_locally_streaming();
-}
-
void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { }
__SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; }
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
index d18aaad..357c9e5 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
@@ -1,10 +1,11 @@
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
// RUN: %s -std=c++23
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
index 51f9d32..ee4eb33 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
@@ -6,8 +6,9 @@
// RUN: -menable-no-nans -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
// RUN: %s -std=c++23
diff --git a/clang/test/SemaCXX/pr100095.cpp b/clang/test/SemaCXX/pr100095.cpp
new file mode 100644
index 0000000..15913fe
--- /dev/null
+++ b/clang/test/SemaCXX/pr100095.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s
+// XFAIL: asserts
+
+template <class> struct Pair;
+template <class...> struct Tuple {
+ template <class _Up> Tuple(_Up);
+};
+template <typename> struct StatusOr;
+template <int> using ElementType = int;
+template <int... fields>
+using Key = Tuple<ElementType<fields>...>;
+template <int... fields>
+StatusOr<Pair<Key<fields...>>> Parser();
+struct Helper { Helper(Tuple<>, Tuple<>, int, int); };
+struct D : Helper {
+ D(Key<> f, int n, int e) : Helper(f, Parser<>, n, e) {}
+};
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 23b07ca..e131212 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2402,11 +2402,11 @@ template<typename T> struct DerivedB : BaseA<T> { };
template<typename T> struct CrazyDerived : T { };
-class class_forward; // expected-note 2 {{forward declaration of 'class_forward'}}
+class class_forward; // expected-note 4 {{forward declaration of 'class_forward'}}
template <class T> class DerivedTemp : Base {};
template <class T> class NonderivedTemp {};
-template <class T> class UndefinedTemp; // expected-note {{declared here}}
+template <class T> class UndefinedTemp; // expected-note 2 {{declared here}}
void is_base_of() {
static_assert(__is_base_of(Base, Derived));
@@ -2457,6 +2457,76 @@ void is_base_of() {
static_assert(!__is_base_of(DerivedB<int>, BaseA<int>));
}
+struct DerivedTransitiveViaNonVirtual : Derived3 {};
+struct DerivedTransitiveViaVirtual : virtual Derived3 {};
+
+template <typename T>
+struct CrazyDerivedVirtual : virtual T {};
+
+struct DerivedPrivate : private virtual Base {};
+struct DerivedProtected : protected virtual Base {};
+struct DerivedPrivatePrivate : private DerivedPrivate {};
+struct DerivedPrivateProtected : private DerivedProtected {};
+struct DerivedProtectedPrivate : protected DerivedProtected {};
+struct DerivedProtectedProtected : protected DerivedProtected {};
+
+void is_virtual_base_of(int n) {
+ static_assert(!__builtin_is_virtual_base_of(Base, Derived));
+ static_assert(!__builtin_is_virtual_base_of(const Base, Derived));
+ static_assert(!__builtin_is_virtual_base_of(Derived, Base));
+ static_assert(!__builtin_is_virtual_base_of(Derived, int));
+ static_assert(!__builtin_is_virtual_base_of(Base, Base));
+ static_assert(!__builtin_is_virtual_base_of(Base, Derived3));
+ static_assert(!__builtin_is_virtual_base_of(Derived, Derived3));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, Derived3));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, Derived3));
+ static_assert(!__builtin_is_virtual_base_of(BaseA<int>, DerivedB<int>));
+ static_assert(!__builtin_is_virtual_base_of(DerivedB<int>, BaseA<int>));
+ static_assert(!__builtin_is_virtual_base_of(Union, Union));
+ static_assert(!__builtin_is_virtual_base_of(Empty, Empty));
+ static_assert(!__builtin_is_virtual_base_of(class_forward, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+ static_assert(!__builtin_is_virtual_base_of(Empty, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+ static_assert(!__builtin_is_virtual_base_of(class_forward, Empty));
+ static_assert(!__builtin_is_virtual_base_of(Base&, Derived&));
+ static_assert(!__builtin_is_virtual_base_of(Base[10], Derived[10]));
+ static_assert(!__builtin_is_virtual_base_of(Base[n], Derived[n])); // expected-error 2 {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+ static_assert(!__builtin_is_virtual_base_of(int, int));
+ static_assert(!__builtin_is_virtual_base_of(int[], int[]));
+ static_assert(!__builtin_is_virtual_base_of(long, int));
+ static_assert(!__builtin_is_virtual_base_of(Base, DerivedTemp<int>));
+ static_assert(!__builtin_is_virtual_base_of(Base, NonderivedTemp<int>));
+ static_assert(!__builtin_is_virtual_base_of(Base, UndefinedTemp<int>)); // expected-error {{implicit instantiation of undefined template 'UndefinedTemp<int>'}}
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtected));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivatePrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivateProtected));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedPrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedProtected));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaNonVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaNonVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaVirtual));
+ static_assert(!__builtin_is_virtual_base_of(Base, CrazyDerived<Base>));
+ static_assert(!__builtin_is_virtual_base_of(CrazyDerived<Base>, Base));
+ static_assert(__builtin_is_virtual_base_of(Base, CrazyDerivedVirtual<Base>));
+ static_assert(!__builtin_is_virtual_base_of(CrazyDerivedVirtual<Base>, Base));
+
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(Union, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Union));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteStruct));
+ static_assert(!__builtin_is_virtual_base_of(Empty, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Empty));
+ static_assert(!__builtin_is_virtual_base_of(int, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, int));
+ static_assert(!__builtin_is_virtual_base_of(Empty, Union));
+ static_assert(!__builtin_is_virtual_base_of(Union, Empty));
+ static_assert(!__builtin_is_virtual_base_of(int, Empty));
+ static_assert(!__builtin_is_virtual_base_of(Union, int));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteStruct[n])); // expected-error {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+}
+
template<class T, class U>
class TemplateClass {};
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 3885166..30c45ed 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -598,10 +598,11 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile;
   // We need LTO to preserve the following global symbols:
- // 1) Symbols used in regular objects.
- // 2) Prevailing symbols that are needed visible to the gpu runtime.
+ // 1) All symbols during a relocatable link.
+ // 2) Symbols used in regular objects.
+  // 3) Prevailing symbols that need to be visible to the GPU runtime.
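+  // In the relocatable (-r) case the output is linked again later, so no
+  // symbol may be internalized or dropped by LTO.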
Res.VisibleToRegularObj =
- ObjSym.UsedInRegularObj ||
+ Args.hasArg(OPT_relocatable) || ObjSym.UsedInRegularObj ||
(Res.Prevailing &&
(Sym.getVisibility() != GlobalValue::HiddenVisibility &&
!Sym.canBeOmittedFromSymbolTable()));
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index 8c80a51..01bd0f8 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -73,6 +73,10 @@ def plugin_opt : Joined<["--", "-"], "plugin-opt=">, Flags<[WrapperOnlyOption]>,
def save_temps : Flag<["--", "-"], "save-temps">,
Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+def relocatable : Flag<["--", "-"], "relocatable">,
+ Flags<[WrapperOnlyOption]>, HelpText<"Perform a relocatable link (LTO only)">;
+def r : Flag<["-"], "r">, Flags<[WrapperOnlyOption]>, Alias<relocatable>;
+
def whole_archive : Flag<["--", "-"], "whole-archive">,
Flags<[WrapperOnlyOption, HelpHidden]>;
def no_whole_archive : Flag<["--", "-"], "no-whole-archive">,
@@ -83,8 +87,7 @@ def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
"the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
"of options.">;
-def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>,
- Alias<mllvm>;
+def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, Alias<mllvm>;
def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
HelpText<"Print generated commands without running.">;
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 6d987cc..9b12caa 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9783,6 +9783,43 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
EXPECT_EQ(ImportedE2, ToE1);
}
+TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) {
+ Decl *ToTU = getToTuDecl("", Lang_CXX03);
+ Decl *FromTU = getTuDecl(
+ R"(
+ struct foo {
+ enum { A };
+ enum { B };
+ };
+ )",
+ Lang_CXX03);
+
+ auto EnumConstA = enumConstantDecl(hasName("A"));
+ auto EnumConstB = enumConstantDecl(hasName("B"));
+
+ auto *FromA = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstA);
+ auto *FromB = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstB);
+
+ auto *ToA = Import(FromA, Lang_CXX03);
+ auto *ToB = Import(FromB, Lang_CXX03);
+
+ ASSERT_TRUE(ToA);
+ ASSERT_TRUE(ToB);
+
+ auto *ToFooA = FirstDeclMatcher<CXXRecordDecl>().match(
+ ToTU, tagDecl(has(enumDecl(has(EnumConstA)))));
+ auto *ToFooB = FirstDeclMatcher<CXXRecordDecl>().match(
+ ToTU, tagDecl(has(enumDecl(has(EnumConstB)))));
+ ASSERT_EQ(ToFooA, ToFooB);
+
+  // The constants should have been imported into two distinct EnumDecls.
+ auto *ToEnumDeclA =
+ FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstA)));
+ auto *ToEnumDeclB =
+ FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstB)));
+ ASSERT_NE(ToEnumDeclA, ToEnumDeclB);
+}
+
INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
DefaultTestValuesForRunOptions);
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 952c83b..e994086 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -1109,6 +1109,20 @@ TEST_F(StructuralEquivalenceEnumTest, EnumsWithDifferentBody) {
EXPECT_FALSE(testStructuralMatch(t));
}
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithSameConsts) {
+ // field x is required to trigger comparison of the anonymous enum
+ auto t = makeNamedDecls("struct foo { enum { A } x; };",
+ "struct foo { enum { A } x;};", Lang_CXX11);
+ EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithDiffConsts) {
+ // field x is required to trigger comparison of the anonymous enum
+ auto t = makeNamedDecls("struct foo { enum { A } x; };",
+ "struct foo { enum { B } x;};", Lang_CXX11);
+ EXPECT_FALSE(testStructuralMatch(t));
+}
+
struct StructuralEquivalenceEnumConstantTest : StructuralEquivalenceTest {};
TEST_F(StructuralEquivalenceEnumConstantTest, EnumConstantsWithSameValues) {
diff --git a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
index 41fca6b..737277e 100644
--- a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
@@ -442,6 +442,55 @@ TEST_F(EnvironmentTest, CXXDefaultInitExprResultObjIsWrappedExprResultObj) {
&Env.getResultObjectLocation(*DefaultInit->getExpr()));
}
+// This test verifies the behavior of `getResultObjectLocation()` in
+// scenarios involving inherited constructors.
+// Since the specific AST node of interest, `CXXConstructorDecl`, is implicitly
+// generated, we cannot annotate any statements inside of it as we do in tests
+// within TransferTest. Thus, the only way to get the right `Environment` is by
+// explicitly initializing it as we do in tests within EnvironmentTest.
+// This is why this test is not inside TransferTest, where most of the tests for
+// `getResultObjectLocation()` are located.
+TEST_F(EnvironmentTest, ResultObjectLocationForInheritedCtorInitExpr) {
+ using namespace ast_matchers;
+
+ std::string Code = R"(
+ struct Base {
+ Base(int b) {}
+ };
+ struct Derived : Base {
+ using Base::Base;
+ };
+
+ Derived d = Derived(0);
+ )";
+
+ auto Unit =
+ tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++20"});
+ auto &Context = Unit->getASTContext();
+
+ ASSERT_EQ(Context.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+ auto Results =
+ match(cxxConstructorDecl(
+ hasAnyConstructorInitializer(cxxCtorInitializer(
+ withInitializer(expr().bind("inherited_ctor_init_expr")))))
+ .bind("ctor"),
+ Context);
+ const auto *Constructor = selectFirst<CXXConstructorDecl>("ctor", Results);
+ const auto *InheritedCtorInit = selectFirst<CXXInheritedCtorInitExpr>(
+ "inherited_ctor_init_expr", Results);
+
+ EXPECT_EQ(InheritedCtorInit->child_begin(), InheritedCtorInit->child_end());
+
+ Environment Env(DAContext, *Constructor);
+ Env.initialize();
+
+ RecordStorageLocation &Loc = Env.getResultObjectLocation(*InheritedCtorInit);
+ EXPECT_NE(&Loc, nullptr);
+
+ EXPECT_EQ(&Loc, Env.getThisPointeeStorageLocation());
+}
+
TEST_F(EnvironmentTest, Stmt) {
using namespace ast_matchers;
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 8c892d9..7c7f003 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1072,7 +1072,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
opts.setFPContractMode(fpContractMode);
}
- if (args.getLastArg(clang::driver::options::OPT_menable_no_infinities)) {
+ if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) {
opts.NoHonorInfs = true;
}
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 5b3ea21..60b8b32 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -307,6 +307,25 @@ private:
WarnOnIoStmt(source);
}
}
+ template <typename A>
+ void ErrorIfHostSymbol(const A &expr, const parser::CharBlock &source) {
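+    // Arrays referenced inside a CUF kernel must live in device, managed,
+    // or unified memory; a plain host array is not accessible from the
+    // kernel, so diagnose it here.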
+ for (const Symbol &sym : CollectCudaSymbols(expr)) {
+ if (const auto *details =
+ sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+ if (details->IsArray() &&
+ (!details->cudaDataAttr() ||
+ (details->cudaDataAttr() &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Device &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
+ *details->cudaDataAttr() !=
+ common::CUDADataAttr::Unified))) {
+ context_.Say(source,
+ "Host array '%s' cannot be present in CUF kernel"_err_en_US,
+ sym.name());
+ }
+ }
+ }
+ }
void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
common::visit(
common::visitors{
@@ -349,6 +368,19 @@ private:
[&](const common::Indirection<parser::IfStmt> &x) {
Check(x.value());
},
+ [&](const common::Indirection<parser::AssignmentStmt> &x) {
+ if (IsCUFKernelDo) {
+ const evaluate::Assignment *assign{
+ semantics::GetAssignment(x.value())};
+ if (assign) {
+ ErrorIfHostSymbol(assign->lhs, source);
+ ErrorIfHostSymbol(assign->rhs, source);
+ }
+ }
+ if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+ context_.Say(source, std::move(*msg));
+ }
+ },
[&](const auto &x) {
if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
context_.Say(source, std::move(*msg));
diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp
index cf1e61c..b6b204b 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang/runtime/transformational.cpp
@@ -508,7 +508,8 @@ void RTDEF(CshiftVector)(Descriptor &result, const Descriptor &source,
SubscriptValue lb{sourceDim.LowerBound()};
for (SubscriptValue j{0}; j < extent; ++j) {
SubscriptValue resultAt{1 + j};
- SubscriptValue sourceAt{lb + (j + shift) % extent};
+ SubscriptValue sourceAt{
+ lb + static_cast<SubscriptValue>(j + shift) % extent};
if (sourceAt < lb) {
sourceAt += extent;
}
@@ -619,7 +620,7 @@ void RTDEF(EoshiftVector)(Descriptor &result, const Descriptor &source,
}
SubscriptValue lb{source.GetDimension(0).LowerBound()};
for (SubscriptValue j{1}; j <= extent; ++j) {
- SubscriptValue sourceAt{lb + j - 1 + shift};
+ SubscriptValue sourceAt{lb + j - 1 + static_cast<SubscriptValue>(shift)};
if (sourceAt >= lb && sourceAt < lb + extent) {
CopyElement(result, &j, source, &sourceAt, terminator);
} else if (boundary) {
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index d657f81..7eb74a4 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -133,7 +133,7 @@ subroutine sub4()
integer, parameter :: n = 10
real, device :: adev(n)
real :: ahost(n)
- real :: b
+ real, managed :: b
integer :: i
adev = ahost
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 99cb6eb..ba5d390 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -7,8 +7,8 @@ subroutine sub1()
integer :: i, j
integer, parameter :: n = 100
integer(8) :: istream
- real :: a(n), b(n)
- real :: c(n,n), d(n,n)
+ real, device :: a(n), b(n)
+ real, device :: c(n,n), d(n,n)
! CHECK-LABEL: func.func @_QPsub1()
! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index 9d73204..9341f05 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,7 +23,8 @@ module m
end subroutine
subroutine test
logical isPinned
- real a(10), x, y, z
+ real, device :: a(10)
+ real :: x, y, z
!$cuf kernel do(1) <<<*, *, stream = 1>>>
do j = 1, 10
end do
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index d2d4d23..195ddac 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -18,6 +18,8 @@ module m
end
program main
+ integer, device :: a_d(10 ,10)
+ integer :: b(10, 10)
!$cuf kernel do <<< *, * >>> ! ok
do j = 1, 0
end do
@@ -90,4 +92,12 @@ program main
else if (ifunc() /= 1) then
end if
end do
+
+ !$cuf kernel do (2) <<<*, *>>>
+ do j = 1, 10
+ do i = 1, 10
+ !ERROR: Host array 'b' cannot be present in CUF kernel
+ a_d(i,j) = b(i,j)
+ enddo
+ enddo
end
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
index 95ff2e8..92d12ab1 100644
--- a/flang/test/Semantics/reduce.cuf
+++ b/flang/test/Semantics/reduce.cuf
@@ -1,9 +1,9 @@
! RUN: %python %S/test_errors.py %s %flang_fc1
subroutine s(n,m,a,l)
integer, intent(in) :: n
- integer, intent(in) :: m(n)
- real, intent(in) :: a(n)
- logical, intent(in) :: l(n)
+ integer, device, intent(in) :: m(n)
+ real, device, intent(in) :: a(n)
+ logical, device, intent(in) :: l(n)
integer j, mr
real ar
logical lr
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 3b50666..38eace2 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -120,6 +120,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 2334fed..b2c5341 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -343,6 +343,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 61ee68a..8e77105 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -213,6 +213,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 0746680..e3ed5a5 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -365,6 +365,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 035ceb8..96f9755 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -365,6 +365,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index b6aced8..06c3682 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -118,6 +118,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index e7db07f..9f88b4d 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -260,7 +260,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| atan | |check| | | | | | 7.12.4.3 | F.10.1.3 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| atan2 | |check| | | | | | 7.12.4.4 | F.10.1.4 |
+| atan2 | |check| | 1 ULP | | | | 7.12.4.4 | F.10.1.4 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| atan2pi | | | | | | 7.12.4.11 | F.10.1.11 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index d5a5cb6..9c84acc 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -692,6 +692,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"atanf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+ FunctionSpec<"atan2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"atan2f", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"acoshf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 53155c6..27b5b94 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3849,6 +3849,26 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ atan2
+ SRCS
+ atan2.cpp
+ HDRS
+ ../atan2.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ .inv_trigf_utils
+ libc.src.__support.FPUtil.double_double
+ libc.src.__support.FPUtil.dyadic_float
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+)
+
+add_entrypoint_object(
scalblnf16
SRCS
scalblnf16.cpp
diff --git a/libc/src/math/generic/atan2.cpp b/libc/src/math/generic/atan2.cpp
new file mode 100644
index 0000000..c39deeb
--- /dev/null
+++ b/libc/src/math/generic/atan2.cpp
@@ -0,0 +1,313 @@
+//===-- Double-precision atan2 function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "inv_trigf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using DoubleDouble = fputil::DoubleDouble;
+
+// atan(i/64) with i = 0..64, generated by Sollya with:
+// > for i from 0 to 64 do {
+// a = round(atan(i/64), D, RN);
+// b = round(atan(i/64) - a, D, RN);
+// print("{", b, ",", a, "},");
+// };
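+// Each entry is a DoubleDouble {lo, hi} pair: hi is atan(i/64) rounded to
+// nearest, and lo is the rounded remainder.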
+constexpr fputil::DoubleDouble ATAN_I[65] = {
+ {0.0, 0.0},
+ {-0x1.220c39d4dff5p-61, 0x1.fff555bbb729bp-7},
+ {-0x1.5ec431444912cp-60, 0x1.ffd55bba97625p-6},
+ {-0x1.86ef8f794f105p-63, 0x1.7fb818430da2ap-5},
+ {-0x1.c934d86d23f1dp-60, 0x1.ff55bb72cfdeap-5},
+ {0x1.ac4ce285df847p-58, 0x1.3f59f0e7c559dp-4},
+ {-0x1.cfb654c0c3d98p-58, 0x1.7ee182602f10fp-4},
+ {0x1.f7b8f29a05987p-58, 0x1.be39ebe6f07c3p-4},
+ {-0x1.cd37686760c17p-59, 0x1.fd5ba9aac2f6ep-4},
+ {-0x1.b485914dacf8cp-59, 0x1.1e1fafb043727p-3},
+ {0x1.61a3b0ce9281bp-57, 0x1.3d6eee8c6626cp-3},
+ {-0x1.054ab2c010f3dp-58, 0x1.5c9811e3ec26ap-3},
+ {0x1.347b0b4f881cap-58, 0x1.7b97b4bce5b02p-3},
+ {0x1.cf601e7b4348ep-59, 0x1.9a6a8e96c8626p-3},
+ {0x1.17b10d2e0e5abp-61, 0x1.b90d7529260a2p-3},
+ {0x1.c648d1534597ep-57, 0x1.d77d5df205736p-3},
+ {0x1.8ab6e3cf7afbdp-57, 0x1.f5b75f92c80ddp-3},
+ {0x1.62e47390cb865p-56, 0x1.09dc597d86362p-2},
+ {0x1.30ca4748b1bf9p-57, 0x1.18bf5a30bf178p-2},
+ {-0x1.077cdd36dfc81p-56, 0x1.278372057ef46p-2},
+ {-0x1.963a544b672d8p-57, 0x1.362773707ebccp-2},
+ {-0x1.5d5e43c55b3bap-56, 0x1.44aa436c2af0ap-2},
+ {-0x1.2566480884082p-57, 0x1.530ad9951cd4ap-2},
+ {-0x1.a725715711fp-56, 0x1.614840309cfe2p-2},
+ {-0x1.c63aae6f6e918p-56, 0x1.6f61941e4def1p-2},
+ {0x1.69c885c2b249ap-56, 0x1.7d5604b63b3f7p-2},
+ {0x1.b6d0ba3748fa8p-56, 0x1.8b24d394a1b25p-2},
+ {0x1.9e6c988fd0a77p-56, 0x1.98cd5454d6b18p-2},
+ {-0x1.24dec1b50b7ffp-56, 0x1.a64eec3cc23fdp-2},
+ {0x1.ae187b1ca504p-56, 0x1.b3a911da65c6cp-2},
+ {-0x1.cc1ce70934c34p-56, 0x1.c0db4c94ec9fp-2},
+ {-0x1.a2cfa4418f1adp-56, 0x1.cde53432c1351p-2},
+ {0x1.a2b7f222f65e2p-56, 0x1.dac670561bb4fp-2},
+ {0x1.0e53dc1bf3435p-56, 0x1.e77eb7f175a34p-2},
+ {-0x1.a3992dc382a23p-57, 0x1.f40dd0b541418p-2},
+ {-0x1.b32c949c9d593p-55, 0x1.0039c73c1a40cp-1},
+ {-0x1.d5b495f6349e6p-56, 0x1.0657e94db30dp-1},
+ {0x1.974fa13b5404fp-58, 0x1.0c6145b5b43dap-1},
+ {-0x1.2bdaee1c0ee35p-58, 0x1.1255d9bfbd2a9p-1},
+ {0x1.c621cec00c301p-55, 0x1.1835a88be7c13p-1},
+ {-0x1.928df287a668fp-58, 0x1.1e00babdefeb4p-1},
+ {0x1.c421c9f38224ep-57, 0x1.23b71e2cc9e6ap-1},
+ {-0x1.09e73b0c6c087p-56, 0x1.2958e59308e31p-1},
+ {0x1.c5d5e9ff0cf8dp-55, 0x1.2ee628406cbcap-1},
+ {0x1.1021137c71102p-55, 0x1.345f01cce37bbp-1},
+ {-0x1.2304331d8bf46p-55, 0x1.39c391cd4171ap-1},
+ {0x1.ecf8b492644fp-56, 0x1.3f13fb89e96f4p-1},
+ {-0x1.f76d0163f79c8p-56, 0x1.445065b795b56p-1},
+ {0x1.2419a87f2a458p-56, 0x1.4978fa3269ee1p-1},
+ {0x1.4a33dbeb3796cp-55, 0x1.4e8de5bb6ec04p-1},
+ {-0x1.1bb74abda520cp-55, 0x1.538f57b89061fp-1},
+ {-0x1.5e5c9d8c5a95p-56, 0x1.587d81f732fbbp-1},
+ {0x1.0028e4bc5e7cap-57, 0x1.5d58987169b18p-1},
+ {-0x1.2b785350ee8c1p-57, 0x1.6220d115d7b8ep-1},
+ {-0x1.6ea6febe8bbbap-56, 0x1.66d663923e087p-1},
+ {-0x1.a80386188c50ep-55, 0x1.6b798920b3d99p-1},
+ {-0x1.8c34d25aadef6p-56, 0x1.700a7c5784634p-1},
+ {0x1.7b2a6165884a1p-59, 0x1.748978fba8e0fp-1},
+ {0x1.406a08980374p-55, 0x1.78f6bbd5d315ep-1},
+ {0x1.560821e2f3aa9p-55, 0x1.7d528289fa093p-1},
+ {-0x1.bf76229d3b917p-56, 0x1.819d0b7158a4dp-1},
+ {0x1.6b66e7fc8b8c3p-57, 0x1.85d69576cc2c5p-1},
+ {-0x1.55b9a5e177a1bp-55, 0x1.89ff5ff57f1f8p-1},
+ {-0x1.ec182ab042f61p-56, 0x1.8e17aa99cc05ep-1},
+ {0x1.1a62633145c07p-55, 0x1.921fb54442d18p-1},
+};
+
+// Approximate atan(x) for |x| <= 2^-7.
+// Using the degree-9 Taylor polynomial:
+// P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9;
+// Then the absolute error is bounded by:
+// |atan(x) - P(x)| < |x|^11/11 < 2^(-7*11) / 11 < 2^-80.
+// And the relative error is bounded by:
+// |(atan(x) - P(x))/atan(x)| < |x|^10 / 10 < 2^-73.
+// For x = x_hi + x_lo, fully expanding the polynomial and dropping any terms
+// smaller than ulp(x_hi^3 / 3) gives us:
+// P(x) ~ x_hi - x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+// + x_lo * (1 - x_hi^2 + x_hi^4)
+DoubleDouble atan_eval(const DoubleDouble &x) {
+ DoubleDouble p;
+ p.hi = x.hi;
+ double x_hi_sq = x.hi * x.hi;
+ // c0 ~ x_hi^2 * 1/5 - 1/3
+ double c0 = fputil::multiply_add(x_hi_sq, 0x1.999999999999ap-3,
+ -0x1.5555555555555p-2);
+ // c1 ~ x_hi^2 * 1/9 - 1/7
+ double c1 = fputil::multiply_add(x_hi_sq, 0x1.c71c71c71c71cp-4,
+ -0x1.2492492492492p-3);
+ // x_hi^3
+ double x_hi_3 = x_hi_sq * x.hi;
+ // x_hi^4
+ double x_hi_4 = x_hi_sq * x_hi_sq;
+  // d0 ~ -(1/3 - x_hi^2 / 5 + x_hi^4 / 7 - x_hi^6 / 9)
+ double d0 = fputil::multiply_add(x_hi_4, c1, c0);
+ // x_lo - x_lo * x_hi^2 + x_lo * x_hi^4
+ double d1 = fputil::multiply_add(x_hi_4 - x_hi_sq, x.lo, x.lo);
+ // p.lo ~ -x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+ // + x_lo * (1 - x_hi^2 + x_hi^4)
+ p.lo = fputil::multiply_add(x_hi_3, d0, d1);
+ return p;
+}
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x), as
+// follows:
+
+// * Range reduction 1: signs
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// formed by the 0x axis and the vector (x, y) in the 0xy-plane.
+// In particular, we have that:
+// atan2(y, x) = atan( y/x ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi + atan( y/x ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( y/x ) if x < 0 and y < 0 (III-quadrant)
+// = atan( y/x ) if x >= 0 and y < 0 (IV-quadrant)
+// Since the atan function is odd, we can use the formula:
+// atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+// atan2(y, x) = atan( |y|/|x| ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi - atan( |y|/|x| ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( |y|/|x| ) if x < 0 and y < 0 (III-quadrant)
+// = -atan( |y|/|x| ) if x >= 0 and y < 0 (IV-quadrant)
+// Which can be simplified to:
+// atan2(y, x) = sign(y) * atan( |y|/|x| ) if x >= 0
+// = sign(y) * (pi - atan( |y|/|x| )) if x < 0
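+// For instance, the point (x, y) = (-1, 1) lies in the II-quadrant, so
+// atan2(1, -1) = pi - atan(1/1) = pi - pi/4 = 3*pi/4.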
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+// atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follows:
+// atan2(y, x) = sign(y) * atan( |y|/|x|) if 0 <= |y| <= x
+// = sign(y) * (pi/2 - atan( |x|/|y| )) if 0 <= x < |y|
+// = sign(y) * (pi - atan( |y|/|x| )) if 0 <= |y| <= -x
+// = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y|
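+// For instance, with x = 1 and y = 2 we have 0 <= x < |y|, so
+// atan2(2, 1) = pi/2 - atan(1/2), and the argument passed to atan stays
+// within [0, 1].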
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+// atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree. To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+// | n/d - idx/64 | <= 1/128.
+// In particular,
+// idx := round(2^6 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+// atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
+// For the accurate pass, we use the addition formula:
+// atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
+// = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
+// And for the fast pass, we use a degree-9 Taylor polynomial for the RHS:
+// atan(u) ~ P(u) = u - u^3/3 + u^5/5 - u^7/7 + u^9/9
+// with absolute errors bounded by:
+// |atan(u) - P(u)| < |u|^11 / 11 < 2^-80
+// and relative errors bounded by:
+// |(atan(u) - P(u)) / P(u)| < u^10 / 11 < 2^-73.
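+// As a small worked example of the look-up step: for n/d = 0.3 we get
+// idx = round(2^6 * 0.3) = round(19.2) = 19, so atan(0.3) is evaluated
+// around atan(19/64), with |0.3 - 19/64| = 0.003125 <= 1/128.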
+
+LLVM_LIBC_FUNCTION(double, atan2, (double y, double x)) {
+ using FPBits = fputil::FPBits<double>;
+
+ constexpr double IS_NEG[2] = {1.0, -1.0};
+ constexpr DoubleDouble ZERO = {0.0, 0.0};
+ constexpr DoubleDouble MZERO = {-0.0, -0.0};
+ constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p+1};
+ constexpr DoubleDouble MPI = {-0x1.1a62633145c07p-53, -0x1.921fb54442d18p+1};
+ constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
+ 0x1.921fb54442d18p0};
+ constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
+ -0x1.921fb54442d18p0};
+ constexpr DoubleDouble PI_OVER_4 = {0x1.1a62633145c07p-55,
+ 0x1.921fb54442d18p-1};
+ constexpr DoubleDouble THREE_PI_OVER_4 = {0x1.a79394c9e8a0ap-54,
+ 0x1.2d97c7f3321d2p+1};
+ // Adjustment for constant term:
+ // CONST_ADJ[x_sign][y_sign][recip]
+ constexpr DoubleDouble CONST_ADJ[2][2][2] = {
+ {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+ {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
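+  // For example, in the II-quadrant (x < 0, y >= 0, and recip = false when
+  // |y| <= |x|), this table gives CONST_ADJ[1][0][0] = -pi; together with
+  // final_sign = -1 computed below, the result becomes
+  // -(-pi + atan(n/d)) = pi - atan(n/d), as required.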
+
+ FPBits x_bits(x), y_bits(y);
+ bool x_sign = x_bits.sign().is_neg();
+ bool y_sign = y_bits.sign().is_neg();
+ x_bits = x_bits.abs();
+ y_bits = y_bits.abs();
+ uint64_t x_abs = x_bits.uintval();
+ uint64_t y_abs = y_bits.uintval();
+ bool recip = x_abs < y_abs;
+ uint64_t min_abs = recip ? x_abs : y_abs;
+ uint64_t max_abs = !recip ? x_abs : y_abs;
+ unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+ double num = FPBits(min_abs).get_val();
+ double den = FPBits(max_abs).get_val();
+
+  // Check for exceptional cases: inputs that are 0, inf, or nan, or that
+  // are close to overflow or underflow.
+ if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) {
+ if (x_bits.is_nan() || y_bits.is_nan())
+ return FPBits::quiet_nan().get_val();
+ unsigned x_except = x_abs == 0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+ unsigned y_except = y_abs == 0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+ // Exceptional cases:
+ // EXCEPT[y_except][x_except][x_is_neg]
+ // with x_except & y_except:
+ // 0: zero
+ // 1: finite, non-zero
+ // 2: infinity
+ constexpr DoubleDouble EXCEPTS[3][3][2] = {
+ {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2},
+ {PI_OVER_2, PI_OVER_2},
+ {PI_OVER_4, THREE_PI_OVER_4}},
+ };
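+    // For example, atan2(+/-0, x) with finite x < 0 selects
+    // EXCEPTS[0][1][1] = pi, which the sign multiplication below turns
+    // into +/-pi, as required by Annex F of the C standard.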
+
+ if ((x_except != 1) || (y_except != 1)) {
+ DoubleDouble r = EXCEPTS[y_except][x_except][x_sign];
+ return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+ }
+ bool scale_up = min_exp < 128U;
+ bool scale_down = max_exp > 0x7ffU - 128U;
+    // At least one input is denormal: multiply both the numerator and the
+    // denominator by a large enough power of 2 to normalize denormal inputs.
+ if (scale_up) {
+ num *= 0x1.0p64;
+ if (!scale_down)
+ den *= 0x1.0p64;
+ } else if (scale_down) {
+ den *= 0x1.0p-64;
+ if (!scale_up)
+ num *= 0x1.0p-64;
+ }
+
+ min_abs = FPBits(num).uintval();
+ max_abs = FPBits(den).uintval();
+ min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+ }
+
+ double final_sign = IS_NEG[(x_sign != y_sign) != recip];
+ DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip];
+ unsigned exp_diff = max_exp - min_exp;
+ // We have the following bound for normalized n and d:
+ // 2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
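+  // When exp_diff > 54, n/d < 2^-53 and atan(n/d) agrees with n/d to far
+  // beyond double precision (the next Taylor term is smaller by a factor of
+  // about (n/d)^2 / 3), so the division alone suffices.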
+ if (LIBC_UNLIKELY(exp_diff > 54)) {
+ return fputil::multiply_add(final_sign, const_term.hi,
+ final_sign * (const_term.lo + num / den));
+ }
+
+ double k = fputil::nearest_integer(64.0 * num / den);
+ unsigned idx = static_cast<unsigned>(k);
+ // k = idx / 64
+ k *= 0x1.0p-6;
+
+ // Range reduction:
+  // atan(n/d) - atan(k) = atan((n/d - k) / (1 + (n/d) * k))
+  //                     = atan((n - d * k) / (d + n * k))
+ DoubleDouble num_k = fputil::exact_mult(num, k);
+ DoubleDouble den_k = fputil::exact_mult(den, k);
+
+ // num_dd = n - d * k
+ DoubleDouble num_dd = fputil::exact_add(num - den_k.hi, -den_k.lo);
+ // den_dd = d + n * k
+ DoubleDouble den_dd = fputil::exact_add(den, num_k.hi);
+ den_dd.lo += num_k.lo;
+
+ // q = (n - d * k) / (d + n * k)
+ DoubleDouble q = fputil::div(num_dd, den_dd);
+ // p ~ atan(q)
+ DoubleDouble p = atan_eval(q);
+
+ DoubleDouble r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+ r.hi *= final_sign;
+ r.lo *= final_sign;
+
+ return r.hi + r.lo;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt
index 6f67fa9..3830bf3 100644
--- a/libc/startup/gpu/CMakeLists.txt
+++ b/libc/startup/gpu/CMakeLists.txt
@@ -26,6 +26,16 @@ function(add_startup_object name)
PROPERTIES
OUTPUT_NAME ${name}.o
)
+
+ # Make an executable target of relocatable bitcode for clang if needed.
+ if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
+ add_executable(${fq_target_name}.exe $<TARGET_OBJECTS:${fq_target_name}>)
+ set_target_properties(${fq_target_name}.exe PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}
+ RUNTIME_OUTPUT_NAME ${name}.o)
+ target_link_options(${fq_target_name}.exe PRIVATE
+ "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" "-march= ")
+ endif()
endfunction()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 3ad5d98..380d283 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -2045,6 +2045,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ atan2_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ atan2_test.cpp
+ DEPENDS
+ libc.src.math.atan2
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
f16add_test
NEED_MPFR
SUITE
diff --git a/libc/test/src/math/atan2_test.cpp b/libc/test/src/math/atan2_test.cpp
new file mode 100644
index 0000000..637ac89
--- /dev/null
+++ b/libc/test/src/math/atan2_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+using LIBC_NAMESPACE::testing::tlog;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+TEST_F(LlvmLibcAtan2Test, TrickyInputs) {
+ mpfr::BinaryInput<double> inputs[] = {
+ {0x1.0853408534085p-2, 0x1.e7b54166c6126p-2},
+ {FPBits::inf().get_val(), 0x0.0000000000001p-1022},
+ };
+
+ for (mpfr::BinaryInput<double> &input : inputs) {
+ double x = input.x;
+ double y = input.y;
+ mpfr::RoundingMode rm = mpfr::RoundingMode::Downward;
+ mpfr::ForceRoundingMode rr(rm);
+ ASSERT_MPFR_MATCH(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(x, y), 0.5, rm);
+ input.x = -input.x;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(-x, y), 0.5);
+ input.y = -input.y;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(-x, -y), 0.5);
+ input.x = -input.x;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(x, -y), 0.5);
+ }
+}
+
+TEST_F(LlvmLibcAtan2Test, InDoubleRange) {
+ constexpr uint64_t X_COUNT = 123;
+ constexpr uint64_t X_START = FPBits(0.25).uintval();
+ constexpr uint64_t X_STOP = FPBits(4.0).uintval();
+ constexpr uint64_t X_STEP = (X_STOP - X_START) / X_COUNT;
+
+ constexpr uint64_t Y_COUNT = 137;
+ constexpr uint64_t Y_START = FPBits(0.25).uintval();
+ constexpr uint64_t Y_STOP = FPBits(4.0).uintval();
+ constexpr uint64_t Y_STEP = (Y_STOP - Y_START) / Y_COUNT;
+
+ auto test = [&](mpfr::RoundingMode rounding_mode) {
+ mpfr::ForceRoundingMode __r(rounding_mode);
+ if (!__r.success)
+ return;
+
+ uint64_t fails = 0;
+ uint64_t finite_count = 0;
+ uint64_t total_count = 0;
+ double failed_x = 0.0, failed_y = 0.0, failed_r = 0.0;
+ double tol = 0.5;
+
+ for (uint64_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
+ double x = FPBits(v).get_val();
+ if (FPBits(x).is_inf_or_nan() || x < 0.0)
+ continue;
+
+ for (uint64_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
+ double y = FPBits(w).get_val();
+ if (FPBits(y).is_inf_or_nan())
+ continue;
+
+ double result = LIBC_NAMESPACE::atan2(x, y);
+ ++total_count;
+ if (FPBits(result).is_inf_or_nan())
+ continue;
+
+ ++finite_count;
+ mpfr::BinaryInput<double> inputs{x, y};
+
+ if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Atan2, inputs,
+ result, 0.5, rounding_mode)) {
+ ++fails;
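+          // Double the tolerance until the result matches, so the log below
+          // reports an upper bound on the observed ULP error.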
+ while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(
+ mpfr::Operation::Atan2, inputs, result, tol, rounding_mode)) {
+ failed_x = x;
+ failed_y = y;
+ failed_r = result;
+
+ if (tol > 1000.0)
+ break;
+
+ tol *= 2.0;
+ }
+ }
+ }
+ }
+ if (fails || (finite_count < total_count)) {
+ tlog << " Atan2 failed: " << fails << "/" << finite_count << "/"
+ << total_count << " tests.\n"
+ << " Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
+ }
+ if (fails) {
+ mpfr::BinaryInput<double> inputs{failed_x, failed_y};
+ EXPECT_MPFR_MATCH(mpfr::Operation::Atan2, inputs, failed_r, 0.5,
+ rounding_mode);
+ }
+ };
+
+ tlog << " Test Rounding To Nearest...\n";
+ test(mpfr::RoundingMode::Nearest);
+
+ tlog << " Test Rounding Downward...\n";
+ test(mpfr::RoundingMode::Downward);
+
+ tlog << " Test Rounding Upward...\n";
+ test(mpfr::RoundingMode::Upward);
+
+ tlog << " Test Rounding Toward Zero...\n";
+ test(mpfr::RoundingMode::TowardZero);
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 5ddc88a3..8b29423 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3532,6 +3532,16 @@ add_fp_unittest(
)
add_fp_unittest(
+ atan2_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ atan2_test.cpp
+ DEPENDS
+ libc.src.math.atan2
+)
+
+add_fp_unittest(
scalblnf16_test
SUITE
libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/atan2_test.cpp b/libc/test/src/math/smoke/atan2_test.cpp
new file mode 100644
index 0000000..61dd6ca
--- /dev/null
+++ b/libc/test/src/math/smoke/atan2_test.cpp
@@ -0,0 +1,22 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+
+TEST_F(LlvmLibcAtan2Test, SpecialNumbers) {
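+  // Expected per IEEE 754 / C Annex F: NaN operands propagate,
+  // atan2(+/-0, +0) is +/-0, and atan2(+/-x, +inf) is +/-0 for finite x.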
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(aNaN, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(1.0, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(zero, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-0.0, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(1.0, inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-1.0, inf));
+}
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 917c6be..0840607 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -310,10 +310,6 @@ endif()
option(LIBCXX_ENABLE_PEDANTIC "Compile with pedantic enabled." OFF)
option(LIBCXX_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
-option(LIBCXX_GENERATE_COVERAGE "Enable generating code coverage." OFF)
-set(LIBCXX_COVERAGE_LIBRARY "" CACHE STRING
- "The Profile-rt library used to build with code coverage")
-
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT OFF)
if (WIN32)
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT ON)
@@ -376,12 +372,6 @@ if (NOT LIBCXX_ENABLE_RTTI AND LIBCXX_ENABLE_EXCEPTIONS)
" for details.")
endif()
-# Ensure LLVM_USE_SANITIZER is not specified when LIBCXX_GENERATE_COVERAGE
-# is ON.
-if (LLVM_USE_SANITIZER AND LIBCXX_GENERATE_COVERAGE)
- message(FATAL_ERROR "LLVM_USE_SANITIZER cannot be used with LIBCXX_GENERATE_COVERAGE")
-endif()
-
if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT)
if (APPLE)
message(FATAL_ERROR "LIBCXX_ENABLE_ABI_LINKER_SCRIPT cannot be used on APPLE targets")
@@ -490,12 +480,6 @@ endif()
# Configure compiler.
include(config-ix)
-# Configure coverage options.
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(CMAKE_BUILD_TYPE "COVERAGE" CACHE STRING "" FORCE)
-endif()
-
#===============================================================================
# Setup Compiler Flags
#===============================================================================
diff --git a/libcxx/cmake/Modules/CodeCoverage.cmake b/libcxx/cmake/Modules/CodeCoverage.cmake
deleted file mode 100644
index 1bd3a78..0000000
--- a/libcxx/cmake/Modules/CodeCoverage.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-find_program(CODE_COVERAGE_LCOV lcov)
-if (NOT CODE_COVERAGE_LCOV)
- message(FATAL_ERROR "Cannot find lcov...")
-endif()
-
-find_program(CODE_COVERAGE_LLVM_COV llvm-cov)
-if (NOT CODE_COVERAGE_LLVM_COV)
- message(FATAL_ERROR "Cannot find llvm-cov...")
-endif()
-
-find_program(CODE_COVERAGE_GENHTML genhtml)
-if (NOT CODE_COVERAGE_GENHTML)
- message(FATAL_ERROR "Cannot find genhtml...")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE "-g -O0 --coverage")
-
-function(setup_lcov_test_target_coverage target_name output_dir capture_dirs source_dirs)
- if (NOT DEFINED LIBCXX_BINARY_DIR)
- message(FATAL_ERROR "Variable must be set")
- endif()
-
- set(GCOV_TOOL "${LIBCXX_BINARY_DIR}/llvm-cov-wrapper")
- file(GENERATE OUTPUT ${GCOV_TOOL}
- CONTENT "#!/usr/bin/env bash\n${CODE_COVERAGE_LLVM_COV} gcov \"$@\"\n")
-
- file(MAKE_DIRECTORY ${output_dir})
-
- set(CAPTURE_DIRS "")
- foreach(cdir ${capture_dirs})
- list(APPEND CAPTURE_DIRS "-d;${cdir}")
- endforeach()
-
- set(EXTRACT_DIRS "")
- foreach(sdir ${source_dirs})
- list(APPEND EXTRACT_DIRS "'${sdir}/*'")
- endforeach()
-
- message(STATUS "Capture Directories: ${CAPTURE_DIRS}")
- message(STATUS "Extract Directories: ${EXTRACT_DIRS}")
-
- add_custom_target(generate-lib${target_name}-coverage
- COMMAND chmod +x ${GCOV_TOOL}
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --capture ${CAPTURE_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --extract test_coverage.info ${EXTRACT_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_GENHTML} --demangle-cpp test_coverage.info -o test_coverage
- COMMAND ${CMAKE_COMMAND} -E remove test_coverage.info
- WORKING_DIRECTORY ${output_dir}
- COMMENT "Generating coverage results")
-endfunction()
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index f0dffef..c5b2ffd 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index f33ed01..f68b265 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 4000f3a..57b8d9f 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index 79d6b44..d3150ec 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index e9b4cc6..8cd1027 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-rtti.cmake b/libcxx/cmake/caches/Generic-no-rtti.cmake
index c62ddce..d080360 100644
--- a/libcxx/cmake/caches/Generic-no-rtti.cmake
+++ b/libcxx/cmake/caches/Generic-no-rtti.cmake
@@ -2,3 +2,7 @@ set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 616baef..81c92fc 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,3 +1,7 @@
set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake
index 27c826ed..afe1c8a 100644
--- a/libcxx/cmake/caches/Generic-no-tzdb.cmake
+++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index 01160bf2..27fbc33 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index 728d410..72c3045 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 55a0f85..97ecf5e 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -78,7 +78,7 @@
"`2139 <https://wg21.link/LWG2139>`__","What is a user-defined type?","Rapperswil","",""
"`2970 <https://wg21.link/LWG2970>`__","Return type of std::visit misspecified","Rapperswil","|Complete|","11.0"
"`3058 <https://wg21.link/LWG3058>`__","Parallel adjacent_difference shouldn't require creating temporaries","Rapperswil","",""
-"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","",""
+"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","|Complete|","17.0"
"`3067 <https://wg21.link/LWG3067>`__","recursive_directory_iterator::pop must invalidate","Rapperswil","|Nothing To Do|",""
"`3071 <https://wg21.link/LWG3071>`__","[networking.ts] read_until still refers to ""input sequence""","Rapperswil","|Nothing To Do|",""
"`3074 <https://wg21.link/LWG3074>`__","Non-member functions for valarray should only deduce from the valarray","Rapperswil","",""
@@ -124,7 +124,7 @@
"`3137 <https://wg21.link/LWG3137>`__","Header for ``__cpp_lib_to_chars``\ ","San Diego","|Complete|",""
"`3140 <https://wg21.link/LWG3140>`__","``COMMON_REF``\ is unimplementable as specified","San Diego","|Nothing To Do|",""
"`3145 <https://wg21.link/LWG3145>`__","``file_clock``\ breaks ABI for C++17 implementations","San Diego","|Complete|",""
-"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","",""
+"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","|Nothing To Do|",""
"`3148 <https://wg21.link/LWG3148>`__","``<concepts>``\ should be freestanding","San Diego","",""
"`3153 <https://wg21.link/LWG3153>`__","``Common``\ and ``common_type``\ have too little in common","San Diego","|Complete|","13.0"
"`3154 <https://wg21.link/LWG3154>`__","``Common``\ and ``CommonReference``\ have a common defect","San Diego","|Nothing To Do|",""
@@ -155,7 +155,7 @@
"`3191 <https://wg21.link/LWG3191>`__","``std::ranges::shuffle``\ synopsis does not match algorithm definition","Cologne","|Complete|","15.0","|ranges|"
"`3196 <https://wg21.link/LWG3196>`__","``std::optional<T>``\ is ill-formed is ``T``\ is an array","Cologne","|Complete|",""
"`3198 <https://wg21.link/LWG3198>`__","Bad constraint on ``std::span::span()``\ ","Cologne","|Complete|",""
-"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","",""
+"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","|Complete|","10.0"
"`3202 <https://wg21.link/LWG3202>`__","P0318R1 was supposed to be revised","Cologne","|Complete|",""
"`3206 <https://wg21.link/LWG3206>`__","``year_month_day``\ conversion to ``sys_days``\ uses not-existing member function","Cologne","|Complete|",""
"`3208 <https://wg21.link/LWG3208>`__","``Boolean``\ 's expression requirements are ordered inconsistently","Cologne","|Nothing To Do|",""
@@ -249,7 +249,7 @@
"`3325 <https://wg21.link/LWG3325>`__","Constrain return type of transformation function for ``transform_view``\ ","Prague","|Complete|","15.0","|ranges|"
"`3326 <https://wg21.link/LWG3326>`__","``enable_view``\ has false positives","Prague","|Complete|","15.0","|ranges|"
"`3327 <https://wg21.link/LWG3327>`__","Format alignment specifiers vs. text direction","Prague","|Nothing To Do|","","|format|"
-"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","",""
+"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","|Nothing To Do|",""
"`3329 <https://wg21.link/LWG3329>`__","``totally_ordered_with``\ both directly and indirectly requires ``common_reference_with``\ ","Prague","|Complete|","13.0"
"`3330 <https://wg21.link/LWG3330>`__","Include ``<compare>``\ from most library headers","Prague","|Complete|","13.0","|spaceship|"
"`3331 <https://wg21.link/LWG3331>`__","Define ``totally_ordered/_with``\ in terms of ``partially-ordered-with``\ ","Prague","|Complete|","13.0"
@@ -271,7 +271,7 @@
"`3358 <https://wg21.link/LWG3358>`__","|sect|\ [span.cons] is mistaken that ``to_address``\ can throw","Prague","|Complete|","17.0"
"`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\ leap second support should allow for negative leap seconds","Prague","|In Progress|","","|chrono|"
"`3360 <https://wg21.link/LWG3360>`__","``three_way_comparable_with``\ is inconsistent with similar concepts","Prague","|Nothing To Do|","","|spaceship|"
-"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","",""
+"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","|Complete|","17.0"
"`3363 <https://wg21.link/LWG3363>`__","``drop_while_view``\ should opt-out of ``sized_range``\ ","Prague","|Nothing To Do|","","|ranges|"
"`3364 <https://wg21.link/LWG3364>`__","Initialize data members of ranges and their iterators","Prague","|Complete|","16.0","|ranges|"
"`3367 <https://wg21.link/LWG3367>`__","Integer-class conversions should not throw","Prague","|Nothing To Do|",""
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 0e466086..e1a5aa6 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -145,7 +145,7 @@
"`3607 <https://wg21.link/LWG3607>`__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","February 2022","|Nothing to do|","","|ranges|"
"`3610 <https://wg21.link/LWG3610>`__","``iota_view::size`` sometimes rejects integer-class types","February 2022","","","|ranges|"
"`3612 <https://wg21.link/LWG3612>`__","Inconsistent pointer alignment in ``std::format`` ","February 2022","|Complete|","14.0","|format|"
-"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","",""
+"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","|Complete|","18.0"
"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","|Complete|","19.0","|ranges|"
"`3619 <https://wg21.link/LWG3619>`__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","February 2022","|Nothing to do|","","|format|"
"`3621 <https://wg21.link/LWG3621>`__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","February 2022","|Complete|","15.0"
@@ -180,7 +180,7 @@
"`3710 <https://wg21.link/LWG3710>`__","The ``end`` of ``chunk_view`` for input ranges can be ``const``","July 2022","","","|ranges|"
"`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
"`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
-"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
+"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","|Nothing To Do|",""
"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
"`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
"`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
@@ -228,7 +228,7 @@
"`3778 <https://wg21.link/LWG3778>`__","``vector<bool>`` missing exception specifications", "November 2022","|Complete|","3.7",""
"`3781 <https://wg21.link/LWG3781>`__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed", "November 2022","|Nothing to do|","",""
"`3782 <https://wg21.link/LWG3782>`__","Should ``<math.h>`` declare ``::lerp``?", "November 2022","|Complete|","17.0",""
-"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","","",""
+"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","|Complete|","19.0",""
"`3785 <https://wg21.link/LWG3785>`__","``ranges::to`` is over-constrained on the destination type being a range", "November 2022","","","|ranges|"
"`3788 <https://wg21.link/LWG3788>`__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment", "November 2022","","",""
"`3792 <https://wg21.link/LWG3792>`__","``__cpp_lib_constexpr_algorithms`` should also be defined in ``<utility>``", "November 2022","|Complete|","16.0",""
@@ -241,9 +241,9 @@
"`3817 <https://wg21.link/LWG3817>`__","Missing preconditions on ``forward_list`` modifiers", "November 2022","","",""
"`3818 <https://wg21.link/LWG3818>`__","Exposition-only concepts are not described in library intro", "November 2022","|Nothing to do|","",""
"`3822 <https://wg21.link/LWG3822>`__","Avoiding normalization in ``filesystem::weakly_canonical``", "November 2022","","",""
-"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","","",""
+"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","|Nothing To Do|","",""
"`3824 <https://wg21.link/LWG3824>`__","Number of ``bind`` placeholders is underspecified", "November 2022","|Nothing to do|","",""
-"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","","",""
+"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","|Nothing To Do|","",""
"","","","","",""
"`2195 <https://wg21.link/LWG2195>`__","Missing constructors for ``match_results``","February 2023","","",""
"`2295 <https://wg21.link/LWG2295>`__","Locale name when the provided ``Facet`` is a ``nullptr``","February 2023","","",""
@@ -288,7 +288,7 @@
"`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
"`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
"`3827 <https://wg21.link/LWG3827>`__","Deprecate ``<stdalign.h>`` and ``<stdbool.h>`` macros","February 2023","","",""
-"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","","",""
+"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","|Nothing To Do|","",""
"`3833 <https://wg21.link/LWG3833>`__","Remove specialization ``template<size_t N> struct formatter<const charT[N], charT>``","February 2023","|Complete|","17.0","|format|"
"`3836 <https://wg21.link/LWG3836>`__","``std::expected<bool, E1>`` conversion constructor ``expected(const expected<U, G>&)`` should take precedence over ``expected(U&&)`` with operator ``bool``","February 2023","|Complete|","18.0",""
"`3843 <https://wg21.link/LWG3843>`__","``std::expected<T,E>::value() &`` assumes ``E`` is copy constructible","February 2023","|Complete|","17.0",""
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index dec9af1..29eb163 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -18,7 +18,7 @@
"`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","|Complete|","18.0",""
"","","","","",""
"`2392 <https://wg21.link/LWG2392>`__","""character type"" is used but not defined","Kona November 2023","","",""
-"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","","",""
+"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","|Nothing To Do|","",""
"`3305 <https://wg21.link/LWG3305>`__","``any_cast<void>``","Kona November 2023","|Complete|","18.0",""
"`3431 <https://wg21.link/LWG3431>`__","``<=>`` for containers should require ``three_way_comparable<T>`` instead of ``<=>``","Kona November 2023","","",""
"`3749 <https://wg21.link/LWG3749>`__","``common_iterator`` should handle integer-class difference types","Kona November 2023","","",""
diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h
index d2254a6..458c1cd 100644
--- a/libcxx/include/__thread/thread.h
+++ b/libcxx/include/__thread/thread.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___THREAD_THREAD_H
#define _LIBCPP___THREAD_THREAD_H
+#include <__assert>
#include <__condition_variable/condition_variable.h>
#include <__config>
#include <__exception/terminate.h>
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index bfc88c4..fe9d2666 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -147,11 +147,6 @@ if(NOT LIBCXX_INSTALL_LIBRARY)
set(exclude_from_all EXCLUDE_FROM_ALL)
endif()
-if (LIBCXX_GENERATE_COVERAGE AND NOT LIBCXX_COVERAGE_LIBRARY)
- find_compiler_rt_library(profile LIBCXX_COVERAGE_LIBRARY)
-endif()
-add_library_flags_if(LIBCXX_COVERAGE_LIBRARY "${LIBCXX_COVERAGE_LIBRARY}")
-
if (APPLE AND LLVM_USE_SANITIZER)
if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR
("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 001b29e..7440ffa 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,12 +1,6 @@
include(HandleLitArguments)
add_subdirectory(tools)
-# By default, libcxx and libcxxabi share a library directory.
-if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
- set(LIBCXX_CXX_ABI_LIBRARY_PATH "${LIBCXX_LIBRARY_DIR}" CACHE PATH
- "The path to libc++abi library.")
-endif()
-
set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
@@ -49,15 +43,3 @@ add_lit_testsuite(check-cxx
"Running libcxx tests"
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS cxx-test-depends)
-
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(output_dir "${CMAKE_CURRENT_BINARY_DIR}/coverage")
- set(capture_dirs
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_objects.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_experimental.dir/"
- "${CMAKE_CURRENT_BINARY_DIR}")
- set(extract_dirs "${LIBCXX_SOURCE_DIR}/include;${LIBCXX_SOURCE_DIR}/src")
- setup_lcov_test_target_coverage("cxx" "${output_dir}" "${capture_dirs}" "${extract_dirs}")
-endif()
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
index 86e0cba..4fc453f 100644
--- a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -12,6 +12,7 @@
#include <atomic>
#include <cassert>
#include <concepts>
+#include <cstddef>
template <typename T>
constexpr void check_required_alignment() {
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 9466e8b..db0bc6c 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -511,6 +511,12 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
return R_TLSDESC;
case R_LARCH_TLS_DESC_CALL:
return R_TLSDESC_CALL;
+ case R_LARCH_TLS_LD_PCREL20_S2:
+ return R_TLSLD_PC;
+ case R_LARCH_TLS_GD_PCREL20_S2:
+ return R_TLSGD_PC;
+ case R_LARCH_TLS_DESC_PCREL20_S2:
+ return R_TLSDESC_PC;
// Other known relocs that are explicitly unimplemented:
//
@@ -557,7 +563,11 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
write64le(loc, val);
return;
+ // Relocs intended for `pcaddi`.
case R_LARCH_PCREL20_S2:
+ case R_LARCH_TLS_LD_PCREL20_S2:
+ case R_LARCH_TLS_GD_PCREL20_S2:
+ case R_LARCH_TLS_DESC_PCREL20_S2:
checkInt(loc, val, 22, rel);
checkAlignment(loc, val, 4, rel);
write32le(loc, setJ20(read32le(loc), val >> 2));
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 36857d7..6ad5c3b 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1308,7 +1308,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
// LoongArch does not yet implement transition from TLSDESC to LE/IE, so
// generate TLSDESC dynamic relocation for the dynamic linker to handle.
if (config->emachine == EM_LOONGARCH &&
- oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_CALL>(expr)) {
+ oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
+ R_TLSDESC_CALL>(expr)) {
if (expr != R_TLSDESC_CALL) {
sym.setFlags(NEEDS_TLSDESC);
c.addReloc({expr, type, offset, addend, &sym});
diff --git a/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
new file mode 100644
index 0000000..d4d12b9
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
@@ -0,0 +1,129 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/bc.s -o %t/bc.32.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.32.o -o %t/bc.32.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/bc.s -o %t/bc.64.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.64.o -o %t/bc.64.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 GD
+# RUN: ld.lld -shared %t/a.32.o %t/bc.32.o -o %t/gd.32.so
+# RUN: llvm-readobj -r %t/gd.32.so | FileCheck --check-prefix=GD32-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.32.so | FileCheck --check-prefix=GD32 %s
+
+## LA32 GD -> LE
+# RUN: ld.lld %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: ld.lld -pie %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le-pie.32
+# RUN: llvm-readelf -r %t/le-pie.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.32 | FileCheck --check-prefix=LE32-GOT %s
+
+## LA32 GD -> IE
+# RUN: ld.lld %t/a.32.o %t/bc.32.so %t/tga.32.o -o %t/ie.32
+# RUN: llvm-readobj -r %t/ie.32 | FileCheck --check-prefix=IE32-REL %s
+# RUN: llvm-readelf -x .got %t/ie.32 | FileCheck --check-prefix=IE32-GOT %s
+
+## LA64 GD
+# RUN: ld.lld -shared %t/a.64.o %t/bc.64.o -o %t/gd.64.so
+# RUN: llvm-readobj -r %t/gd.64.so | FileCheck --check-prefix=GD64-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.64.so | FileCheck --check-prefix=GD64 %s
+
+## LA64 GD -> LE
+# RUN: ld.lld %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: ld.lld -pie %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le-pie.64
+# RUN: llvm-readelf -r %t/le-pie.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.64 | FileCheck --check-prefix=LE64-GOT %s
+
+## LA64 GD -> IE
+# RUN: ld.lld %t/a.64.o %t/bc.64.so %t/tga.64.o -o %t/ie.64
+# RUN: llvm-readobj -r %t/ie.64 | FileCheck --check-prefix=IE64-REL %s
+# RUN: llvm-readelf -x .got %t/ie.64 | FileCheck --check-prefix=IE64-GOT %s
+
+# GD32-REL: .rela.dyn {
+# GD32-REL-NEXT: 0x20300 R_LARCH_TLS_DTPMOD32 a 0x0
+# GD32-REL-NEXT: 0x20304 R_LARCH_TLS_DTPREL32 a 0x0
+# GD32-REL-NEXT: 0x20308 R_LARCH_TLS_DTPMOD32 b 0x0
+# GD32-REL-NEXT: 0x2030C R_LARCH_TLS_DTPREL32 b 0x0
+# GD32-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x20300 - 0x10250 = 16428<<2
+# GD32: 10250: pcaddi $a0, 16428
+# GD32-NEXT: bl 44
+
+## &DTPMOD(b) - . = 0x20308 - 0x10258 = 16428<<2
+# GD32: 10258: pcaddi $a0, 16428
+# GD32-NEXT: bl 36
+
+# GD64-REL: .rela.dyn {
+# GD64-REL-NEXT: 0x204C0 R_LARCH_TLS_DTPMOD64 a 0x0
+# GD64-REL-NEXT: 0x204C8 R_LARCH_TLS_DTPREL64 a 0x0
+# GD64-REL-NEXT: 0x204D0 R_LARCH_TLS_DTPMOD64 b 0x0
+# GD64-REL-NEXT: 0x204D8 R_LARCH_TLS_DTPREL64 b 0x0
+# GD64-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x204c0 - 0x10398 = 16458<<2
+# GD64: 10398: pcaddi $a0, 16458
+# GD64-NEXT: bl 52
+
+## &DTPMOD(b) - . = 0x204d0 - 0x103a0 = 16460<<2
+# GD64: 103a0: pcaddi $a0, 16460
+# GD64-NEXT: bl 44
+
+# NOREL: no relocations
+
+## .got contains pre-populated values: [a@dtpmod, a@dtprel, b@dtpmod, b@dtprel]
+## a@dtprel = st_value(a) = 0x8
+## b@dtprel = st_value(b) = 0xc
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x[[#%x,A:]] 01000000 08000000 01000000 0c000000
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 08000000 00000000
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 0c000000 00000000
+
+## a is local - relaxed to LE - its DTPMOD/DTPREL slots are link-time constants.
+## b is external - DTPMOD/DTPREL dynamic relocations are required.
+# IE32-REL: .rela.dyn {
+# IE32-REL-NEXT: 0x30220 R_LARCH_TLS_DTPMOD32 b 0x0
+# IE32-REL-NEXT: 0x30224 R_LARCH_TLS_DTPREL32 b 0x0
+# IE32-REL-NEXT: }
+# IE32-GOT: section '.got':
+# IE32-GOT-NEXT: 0x00030218 01000000 08000000 00000000 00000000
+
+# IE64-REL: .rela.dyn {
+# IE64-REL-NEXT: 0x30380 R_LARCH_TLS_DTPMOD64 b 0x0
+# IE64-REL-NEXT: 0x30388 R_LARCH_TLS_DTPREL64 b 0x0
+# IE64-REL-NEXT: }
+# IE64-GOT: section '.got':
+# IE64-GOT-NEXT: 0x00030370 01000000 00000000 08000000 00000000
+# IE64-GOT-NEXT: 0x00030380 00000000 00000000 00000000 00000000
+
+#--- a.s
+pcaddi $a0, %gd_pcrel_20(a)
+bl %plt(__tls_get_addr)
+
+pcaddi $a0, %gd_pcrel_20(b)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 4
+
+#--- bc.s
+.section .tbss,"awT",@nobits
+.globl b, c
+b:
+.zero 4
+c:
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
new file mode 100644
index 0000000..70186f5
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
@@ -0,0 +1,82 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --position-independent %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --position-independent %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 LD
+# RUN: ld.lld -shared %t/a.32.o -o %t/ld.32.so
+# RUN: llvm-readobj -r %t/ld.32.so | FileCheck --check-prefix=LD32-REL %s
+# RUN: llvm-readelf -x .got %t/ld.32.so | FileCheck --check-prefix=LD32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.32.so | FileCheck --check-prefixes=LD32 %s
+
+## LA32 LD -> LE
+# RUN: ld.lld %t/a.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.32 | FileCheck --check-prefixes=LE32 %s
+
+## LA64 LD
+# RUN: ld.lld -shared %t/a.64.o -o %t/ld.64.so
+# RUN: llvm-readobj -r %t/ld.64.so | FileCheck --check-prefix=LD64-REL %s
+# RUN: llvm-readelf -x .got %t/ld.64.so | FileCheck --check-prefix=LD64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.64.so | FileCheck --check-prefixes=LD64 %s
+
+## LA64 LD -> LE
+# RUN: ld.lld %t/a.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.64 | FileCheck --check-prefixes=LE64 %s
+
+## a@dtprel = st_value(a) = 0 is a link-time constant.
+# LD32-REL: .rela.dyn {
+# LD32-REL-NEXT: 0x20280 R_LARCH_TLS_DTPMOD32 - 0x0
+# LD32-REL-NEXT: }
+# LD32-GOT: section '.got':
+# LD32-GOT-NEXT: 0x00020280 00000000 00000000
+
+# LD64-REL: .rela.dyn {
+# LD64-REL-NEXT: 0x20400 R_LARCH_TLS_DTPMOD64 - 0x0
+# LD64-REL-NEXT: }
+# LD64-GOT: section '.got':
+# LD64-GOT-NEXT: 0x00020400 00000000 00000000 00000000 00000000
+
+## LA32: &DTPMOD(a) - . = 0x20280 - 0x101cc = 16429<<2
+# LD32: 101cc: pcaddi $a0, 16429
+# LD32-NEXT: bl 48
+
+## LA64: &DTPMOD(a) - . = 0x20400 - 0x102e0 = 16456<<2
+# LD64: 102e0: pcaddi $a0, 16456
+# LD64-NEXT: bl 44
+
+# NOREL: no relocations
+
+## a is local - its DTPMOD/DTPREL slots are link-time constants.
+## a@dtpmod = 1 (main module)
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x0003011c 01000000 00000000
+
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x000301d0 01000000 00000000 00000000 00000000
+
+## LA32: DTPMOD(.LANCHOR0) - . = 0x3011c - 0x20114 = 16386<<2
+# LE32: 20114: pcaddi $a0, 16386
+# LE32-NEXT: bl 4
+
+## LA64: DTPMOD(.LANCHOR0) - . = 0x301d0 - 0x201c8 = 16386<<2
+# LE64: 201c8: pcaddi $a0, 16386
+# LE64-NEXT: bl 4
+
+#--- a.s
+pcaddi $a0, %ld_pcrel_20(.LANCHOR0)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.set .LANCHOR0, . + 0
+.zero 8
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
new file mode 100644
index 0000000..99e21d9
--- /dev/null
+++ b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
@@ -0,0 +1,142 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 a.s -o a.64.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 c.s -o c.64.o
+# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 a.s -o a.32.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 c.s -o c.32.o
+# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so
+# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
+# RUN: llvm-objdump --no-show-raw-insn -h -d a.64.so | FileCheck %s --check-prefix=GD64
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel
+# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s
+
+## FIXME: The transition from TLSDESC to IE/LE has not yet been implemented.
+## Keep the dynamic relocations and hand them over to the dynamic linker.
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le
+# RUN: llvm-readobj -r -x .got a.64.le | FileCheck --check-prefix=LE64-RELA %s
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie
+# RUN: llvm-readobj -r -x .got a.64.ie | FileCheck --check-prefix=IE64-RELA %s
+
+## 32-bit code is mostly the same. We only test a few variants.
+
+# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel
+# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s
+
+# GD64-RELA: .rela.dyn {
+# GD64-RELA-NEXT: 0x203F0 R_LARCH_TLS_DESC64 - 0x7FF
+# GD64-RELA-NEXT: 0x203D0 R_LARCH_TLS_DESC64 a 0x0
+# GD64-RELA-NEXT: 0x203E0 R_LARCH_TLS_DESC64 c 0x0
+# GD64-RELA-NEXT: }
+# GD64-RELA: Hex dump of section '.got':
+# GD64-RELA-NEXT: 0x000203d0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203e0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203f0 00000000 00000000 00000000 00000000 .
+
+# GD64-REL: .rel.dyn {
+# GD64-REL-NEXT: 0x203D8 R_LARCH_TLS_DESC64 -
+# GD64-REL-NEXT: 0x203B8 R_LARCH_TLS_DESC64 a
+# GD64-REL-NEXT: 0x203C8 R_LARCH_TLS_DESC64 c
+# GD64-REL-NEXT: }
+# GD64-REL: Hex dump of section '.got':
+# GD64-REL-NEXT: 0x000203b8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203c8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203d8 00000000 00000000 ff070000 00000000 .
+
+# GD64: .got 00000030 00000000000203d0
+
+## &.got[a]-. = 0x203d0 - 0x102e0 = 16444<<2
+# GD64: 102e0: pcaddi $a0, 16444
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a1, $a0, $tp
+
+## &.got[b]-. = 0x203d0+32 - 0x102f0 = 16448<<2
+# GD64: 102f0: pcaddi $a0, 16448
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a2, $a0, $tp
+
+## &.got[c]-. = 0x203d0+16 - 0x10300 = 16440<<2
+# GD64: 10300: pcaddi $a0, 16440
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a3, $a0, $tp
+
+# LE64-RELA: .rela.dyn {
+# LE64-RELA-NEXT: 0x30240 R_LARCH_TLS_DESC64 - 0x8
+# LE64-RELA-NEXT: 0x30250 R_LARCH_TLS_DESC64 - 0x800
+# LE64-RELA-NEXT: 0x30260 R_LARCH_TLS_DESC64 - 0x7FF
+# LE64-RELA-NEXT: }
+# LE64-RELA: Hex dump of section '.got':
+# LE64-RELA-NEXT: 0x00030240 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030250 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030260 00000000 00000000 00000000 00000000 .
+
+# IE64-RELA: .rela.dyn {
+# IE64-RELA-NEXT: 0x303C8 R_LARCH_TLS_DESC64 - 0x8
+# IE64-RELA-NEXT: 0x303E8 R_LARCH_TLS_DESC64 - 0x7FF
+# IE64-RELA-NEXT: 0x303D8 R_LARCH_TLS_DESC64 c 0x0
+# IE64-RELA-NEXT: }
+# IE64-RELA: Hex dump of section '.got':
+# IE64-RELA-NEXT: 0x000303c8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303d8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303e8 00000000 00000000 00000000 00000000 .
+
+# GD32-REL: .rel.dyn {
+# GD32-REL-NEXT: 0x20264 R_LARCH_TLS_DESC32 -
+# GD32-REL-NEXT: 0x20254 R_LARCH_TLS_DESC32 a
+# GD32-REL-NEXT: 0x2025C R_LARCH_TLS_DESC32 c
+# GD32-REL-NEXT: }
+# GD32-REL: Hex dump of section '.got':
+# GD32-REL-NEXT: 0x00020254 00000000 00000000 00000000 00000000 .
+# GD32-REL-NEXT: 0x00020264 00000000 ff070000 .
+
+#--- a.s
+.macro add dst, src1, src2
+.ifdef ELF32
+add.w \dst, \src1, \src2
+.else
+add.d \dst, \src1, \src2
+.endif
+.endm
+.macro load dst, src1, src2
+.ifdef ELF32
+ld.w \dst, \src1, \src2
+.else
+ld.d \dst, \src1, \src2
+.endif
+.endm
+
+pcaddi $a0, %desc_pcrel_20(a)
+load $ra, $a0, %desc_ld(a)
+jirl $ra, $ra, %desc_call(a)
+add $a1, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(b)
+load $ra, $a0, %desc_ld(b)
+jirl $ra, $ra, %desc_call(b)
+add $a2, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(c)
+load $ra, $a0, %desc_ld(c)
+jirl $ra, $ra, %desc_call(c)
+add $a3, $a0, $tp
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 2039 ## Place b at 0x7ff
+b:
+.zero 1
+
+#--- c.s
+.section .tbss,"awT",@nobits
+.globl c
+c: .zero 4
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 187370e..5d0a3e3 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -1145,8 +1145,8 @@ Status GDBRemoteCommunication::StartDebugserverProcess(
if (socket_pipe.CanWrite())
socket_pipe.CloseWriteFileDescriptor();
if (socket_pipe.CanRead()) {
- char port_cstr[PATH_MAX] = {0};
- port_cstr[0] = '\0';
+      // The port number is at most "65535" plus its NUL terminator (6 bytes).
+ char port_cstr[6] = {0};
size_t num_bytes = sizeof(port_cstr);
// Read port from pipe with 10 second timeout.
error = socket_pipe.ReadWithTimeout(
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index f70efe5..0386e3b 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -4726,66 +4726,67 @@ TypeSystemClang::GetFloatTypeSemantics(size_t byte_size) {
}
std::optional<uint64_t>
+TypeSystemClang::GetObjCBitSize(QualType qual_type,
+ ExecutionContextScope *exe_scope) {
+ assert(qual_type->isObjCObjectOrInterfaceType());
+ ExecutionContext exe_ctx(exe_scope);
+ if (Process *process = exe_ctx.GetProcessPtr()) {
+ if (ObjCLanguageRuntime *objc_runtime =
+ ObjCLanguageRuntime::Get(*process)) {
+ if (std::optional<uint64_t> bit_size =
+ objc_runtime->GetTypeBitSize(GetType(qual_type)))
+ return *bit_size;
+ }
+ } else {
+ static bool g_printed = false;
+ if (!g_printed) {
+ StreamString s;
+ DumpTypeDescription(qual_type.getAsOpaquePtr(), s);
+
+ llvm::outs() << "warning: trying to determine the size of type ";
+ llvm::outs() << s.GetString() << "\n";
+ llvm::outs() << "without a valid ExecutionContext. this is not "
+ "reliable. please file a bug against LLDB.\n";
+ llvm::outs() << "backtrace:\n";
+ llvm::sys::PrintStackTrace(llvm::outs());
+ llvm::outs() << "\n";
+ g_printed = true;
+ }
+ }
+
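+  // Fall back to the static AST size, plus the size of the builtin
+  // Objective-C Class type, when the runtime cannot provide a size.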
+ return getASTContext().getTypeSize(qual_type) +
+ getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
+}
+
+std::optional<uint64_t>
TypeSystemClang::GetBitSize(lldb::opaque_compiler_type_t type,
ExecutionContextScope *exe_scope) {
- if (GetCompleteType(type)) {
- clang::QualType qual_type(GetCanonicalQualType(type));
- const clang::Type::TypeClass type_class = qual_type->getTypeClass();
- switch (type_class) {
- case clang::Type::Record:
- if (GetCompleteType(type))
- return getASTContext().getTypeSize(qual_type);
- else
- return std::nullopt;
- break;
+ if (!GetCompleteType(type))
+ return std::nullopt;
- case clang::Type::ObjCInterface:
- case clang::Type::ObjCObject: {
- ExecutionContext exe_ctx(exe_scope);
- Process *process = exe_ctx.GetProcessPtr();
- if (process) {
- if (ObjCLanguageRuntime *objc_runtime =
- ObjCLanguageRuntime::Get(*process)) {
- if (std::optional<uint64_t> bit_size =
- objc_runtime->GetTypeBitSize(GetType(qual_type)))
- return *bit_size;
- }
- } else {
- static bool g_printed = false;
- if (!g_printed) {
- StreamString s;
- DumpTypeDescription(type, s);
-
- llvm::outs() << "warning: trying to determine the size of type ";
- llvm::outs() << s.GetString() << "\n";
- llvm::outs() << "without a valid ExecutionContext. this is not "
- "reliable. please file a bug against LLDB.\n";
- llvm::outs() << "backtrace:\n";
- llvm::sys::PrintStackTrace(llvm::outs());
- llvm::outs() << "\n";
- g_printed = true;
- }
- }
- }
- [[fallthrough]];
- default:
- const uint32_t bit_size = getASTContext().getTypeSize(qual_type);
- if (bit_size == 0) {
- if (qual_type->isIncompleteArrayType())
- return getASTContext().getTypeSize(
- qual_type->getArrayElementTypeNoTypeQual()
- ->getCanonicalTypeUnqualified());
- }
- if (qual_type->isObjCObjectOrInterfaceType())
- return bit_size +
- getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
- // Function types actually have a size of 0, that's not an error.
- if (qual_type->isFunctionProtoType())
- return bit_size;
- if (bit_size)
- return bit_size;
- }
+ clang::QualType qual_type(GetCanonicalQualType(type));
+ const clang::Type::TypeClass type_class = qual_type->getTypeClass();
+ switch (type_class) {
+ case clang::Type::FunctionProto:
+ case clang::Type::Record:
+ return getASTContext().getTypeSize(qual_type);
+ case clang::Type::ObjCInterface:
+ case clang::Type::ObjCObject:
+ return GetObjCBitSize(qual_type, exe_scope);
+ case clang::Type::IncompleteArray: {
+ const uint64_t bit_size = getASTContext().getTypeSize(qual_type);
+ if (bit_size == 0)
+ return getASTContext().getTypeSize(
+ qual_type->getArrayElementTypeNoTypeQual()
+ ->getCanonicalTypeUnqualified());
+
+ return bit_size;
+ }
+ default:
+ if (const uint64_t bit_size = getASTContext().getTypeSize(qual_type))
+ return bit_size;
}
+
return std::nullopt;
}
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index d67b7a4..70722eb 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -1172,6 +1172,9 @@ private:
/// on creation of a new instance.
void LogCreation() const;
+ std::optional<uint64_t> GetObjCBitSize(clang::QualType qual_type,
+ ExecutionContextScope *exe_scope);
+
// Classes that inherit from TypeSystemClang can see and modify these
std::string m_target_triple;
std::unique_ptr<clang::ASTContext> m_ast_up;
diff --git a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
index 9e484e0..730537d 100644
--- a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
@@ -5,9 +5,6 @@ from lldbsuite.test import lldbutil
class TestCaseClassTemplateNonTypeParameterPack(TestBase):
- @expectedFailureAll(
- oslist=["windows"], archs=["i[3-6]86", "x86_64"]
- ) # Fails to read memory from target.
@no_debug_info_test
def test(self):
self.build()
diff --git a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
index 102c00d..1ed643e 100644
--- a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
@@ -5,9 +5,6 @@ from lldbsuite.test import lldbutil
class TestCaseClassTemplateTypeParameterPack(TestBase):
- @expectedFailureAll(
- oslist=["windows"], archs=["i[3-6]86", "x86_64"]
- ) # Fails to read memory from target.
@no_debug_info_test
def test(self):
self.build()
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 8ecbaf7..11a14d2 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -157,6 +157,20 @@ locally on port `2345`.
}
```
+You can also use the `gdb-remote-port` parameter to send an attach request
+to a debug server running on the current machine,
+instead of using the custom command `attachCommands`.
+
+```javascript
+{
+ "name": "Local Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 2345,
+}
+```
+
#### Connect to a Debug Server on Another Machine
This connects to a debug server running on another machine with hostname
@@ -173,6 +187,23 @@ port `5678` of that other machine.
}
```
+You can also use the `gdb-remote-hostname` and `gdb-remote-port` parameters
+to send an attach request to a debug server running on a different machine,
+instead of the custom `attachCommands` command.
+The default hostname is `localhost`.
+
+```javascript
+{
+  "name": "Remote Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 5678,
+ "gdb-remote-hostname": "hostname",
+}
+```
+
## Custom debugger commands
The `lldb-dap` tool includes additional custom commands to support the Debug
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index fd5de30..97e4efe 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -1,7 +1,7 @@
{
"name": "lldb-dap",
"displayName": "LLDB DAP",
- "version": "0.2.2",
+ "version": "0.2.3",
"publisher": "llvm-vs-code-extensions",
"homepage": "https://lldb.llvm.org",
"description": "LLDB debugging from VSCode",
@@ -353,7 +353,7 @@
"number",
"string"
],
- "description": "TCP/IP port to attach to. Specifying both pid and port is an error."
+ "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
},
"gdb-remote-hostname": {
"type": "string",
diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index 86b010e..908e665 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -519,7 +519,6 @@ external vector_size : lltype -> int = "llvm_vector_size"
(*--... Operations on other types ..........................................--*)
external void_type : llcontext -> lltype = "llvm_void_type"
external label_type : llcontext -> lltype = "llvm_label_type"
-external x86_mmx_type : llcontext -> lltype = "llvm_x86_mmx_type"
external type_by_name : llmodule -> string -> lltype option = "llvm_type_by_name"
external classify_value : llvalue -> ValueKind.t = "llvm_classify_value"
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index e00165c..231de56 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -1392,6 +1392,7 @@ How to use reduce-chunk-list:
First, figure out the number of calls to the debug counter you want to minimize.
To do so, run the compilation command for the case you want to minimize with `-print-debug-counter`, adding `-mllvm` if needed.
Then find the line with the counter of interest. It should look like:
+
.. code-block:: none
my-counter : {5678,empty}
@@ -1400,6 +1401,7 @@ The number of calls to `my-counter` is 5678
Then find the minimum set of interesting chunks with `reduce-chunk-list`.
Build a reproducer script like:
+
.. code-block:: bash
#! /bin/bash
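# A minimal sketch of the rest of such a reproducer (hedged: `my-counter`,
# `repro.ll`, and the grep pattern below are placeholders, and the script is
# assumed to receive the candidate chunk list as its first argument):
chunks=$1
# Re-run the failing compile with only the selected chunks of the counter
# enabled; under clang the flag would need the `-mllvm` prefix noted above.
opt -debug-counter=my-counter=$chunks repro.ll -S -o out.ll
# Exit 0 exactly when the bug still reproduces, so the reducer keeps chunks.
grep -q "bad-pattern" out.ll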
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 67d474d..f71cd5b 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -118,12 +118,13 @@ public:
return;
}
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = getEmptyKey();
if (std::is_trivially_destructible<ValueT>::value) {
// Use a simpler loop when values don't need destruction.
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
P->getFirst() = EmptyKey;
} else {
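+    // The tombstone key is needed only on this path, to skip empty and
+    // tombstone buckets while destroying the values of occupied ones.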
+ const KeyT TombstoneKey = getTombstoneKey();
unsigned NumEntries = getNumEntries();
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e..43eac14 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -200,9 +200,7 @@ public:
///
/// Only checks sets with elements in \p CheckDeps.
bool areDepsSafe(const DepCandidates &AccessSets,
- const MemAccessInfoList &CheckDeps,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ const MemAccessInfoList &CheckDeps);
/// No memory dependence was encountered that would inhibit
/// vectorization.
@@ -352,11 +350,8 @@ private:
/// element access it records this distance in \p MinDepDistBytes (if this
/// distance is smaller than any other distance encountered so far).
/// Otherwise, this function returns true signaling a possible dependence.
- Dependence::DepType
- isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
- unsigned BIdx,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx);
/// Check whether the data dependence could prevent store-load
/// forwarding.
@@ -393,11 +388,9 @@ private:
/// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
/// BIsWrite).
std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
- getDependenceDistanceStrideAndSize(
- const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B,
- Instruction *BInst,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst,
+ const MemAccessInfo &B,
+ Instruction *BInst);
};
class RuntimePointerChecking;
@@ -799,7 +792,8 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
Value *Ptr);
/// If the pointer has a constant stride return it in units of the access type
-/// size. Otherwise return std::nullopt.
+/// size. If the pointer is loop-invariant, return 0. Otherwise return
+/// std::nullopt.
///
/// Ensure that it does not wrap in the address space, assuming the predicate
/// associated with \p PSE is true.
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index f57be39..36d1b47 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -225,6 +225,9 @@ private:
/// split stack prologue.
bool HasNoSplitStack = false;
+ /// True if debugging information is available in this module.
+ bool DbgInfoAvailable = false;
+
protected:
explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
@@ -430,6 +433,9 @@ public:
/// Get the CFISection type for the module.
CFISection getModuleCFISectionType() const { return ModuleCFISection; }
+ /// Returns true if valid debug info is present.
+ bool hasDebugInfo() const { return DbgInfoAvailable; }
+
bool needsSEHMoves();
/// Since emitting CFI unwind information is entangled with supporting the
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 45a47d7..04667c0 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -188,6 +188,9 @@ public:
/// SelectionDAGISel::PrepareEHLandingPad().
unsigned ExceptionPointerVirtReg, ExceptionSelectorVirtReg;
+ /// The current call site index being processed, if any. 0 if none.
+ unsigned CurCallSite = 0;
+
/// Collection of dbg.declare instructions handled after argument
/// lowering and before ISel proper.
SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
@@ -281,6 +284,12 @@ public:
Register getCatchPadExceptionPointerVReg(const Value *CPI,
const TargetRegisterClass *RC);
+ /// Set the call site currently being processed.
+ void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
+
+ /// Get the call site currently being processed, if any. Return zero if none.
+ unsigned getCurrentCallSite() { return CurCallSite; }
+
private:
/// LiveOutRegInfo - Information about live out vregs.
IndexedMap<LiveOutInfo, VirtReg2IndexFunctor> LiveOutRegInfo;
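A sketch of the SjLj call-site bookkeeping after its move from MachineModuleInfo to this per-function object (FuncInfo is the FunctionLoweringInfo of the function being selected; this mirrors the SelectionDAGBuilder changes later in the patch):

  // While lowering llvm.eh.sjlj.callsite (CI is its constant operand):
  FuncInfo.setCurrentCallSite(CI->getZExtValue());
  // When the matching invoke's begin label is emitted:
  if (unsigned CallSiteIndex = FuncInfo.getCurrentCallSite()) {
    MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
    FuncInfo.setCurrentCallSite(0); // stop tracking this call site
  }

Since the counter is inherently per-function, keeping it here also removes a module-level mutation from instruction selection.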
diff --git a/llvm/include/llvm/CodeGen/MIRPrinter.h b/llvm/include/llvm/CodeGen/MIRPrinter.h
index e98ef72..85bd674 100644
--- a/llvm/include/llvm/CodeGen/MIRPrinter.h
+++ b/llvm/include/llvm/CodeGen/MIRPrinter.h
@@ -21,6 +21,7 @@ namespace llvm {
class MachineBasicBlock;
class MachineFunction;
+class MachineModuleInfo;
class Module;
template <typename T> class SmallVectorImpl;
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6e7292a..e1d03fe 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -59,7 +59,6 @@ class MachineConstantPool;
class MachineFrameInfo;
class MachineFunction;
class MachineJumpTableInfo;
-class MachineModuleInfo;
class MachineRegisterInfo;
class MCContext;
class MCInstrDesc;
@@ -260,7 +259,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
const LLVMTargetMachine &Target;
const TargetSubtargetInfo *STI;
MCContext &Ctx;
- MachineModuleInfo &MMI;
// RegInfo - Information about each register in use in the function.
MachineRegisterInfo *RegInfo;
@@ -395,15 +393,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
/// \}
- /// Clear all the members of this MachineFunction, but the ones used
- /// to initialize again the MachineFunction.
- /// More specifically, this deallocates all the dynamically allocated
- /// objects and get rid of all the XXXInfo data structure, but keep
- /// unchanged the references to Fn, Target, MMI, and FunctionNumber.
+ /// Clear all the members of this MachineFunction, except the ones used to
+ /// reinitialize the MachineFunction. More specifically, this deallocates all
+ /// the dynamically allocated objects, gets rid of all the XXXInfo data
+ /// structures, and keeps the references to Fn, Target, and FunctionNumber
+ /// unchanged.
void clear();
/// Allocate and initialize the different members.
/// In particular, the XXXInfo data structure.
- /// \pre Fn, Target, MMI, and FunctionNumber are properly set.
+ /// \pre Fn, Target, and FunctionNumber are properly set.
void init();
public:
@@ -632,8 +630,8 @@ public:
const static unsigned int DebugOperandMemNumber;
MachineFunction(Function &F, const LLVMTargetMachine &Target,
- const TargetSubtargetInfo &STI, unsigned FunctionNum,
- MachineModuleInfo &MMI);
+ const TargetSubtargetInfo &STI, MCContext &Ctx,
+ unsigned FunctionNum);
MachineFunction(const MachineFunction &) = delete;
MachineFunction &operator=(const MachineFunction &) = delete;
~MachineFunction();
@@ -665,7 +663,6 @@ public:
GISelChangeObserver *getObserver() const { return Observer; }
- MachineModuleInfo &getMMI() const { return MMI; }
MCContext &getContext() const { return Ctx; }
/// Returns the Section this function belongs to.
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 229f515..04c8144 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1723,7 +1723,7 @@ public:
/// Return true if it is safe to move this instruction. If
/// SawStore is set to true, it means that there is a store (or call) between
/// the instruction's location and its intended destination.
- bool isSafeToMove(AAResults *AA, bool &SawStore) const;
+ bool isSafeToMove(bool &SawStore) const;
/// Returns true if this instruction's memory access aliases the memory
/// access of Other.
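Call sites simply drop the removed AAResults argument; a before/after sketch:

  bool SawStore = false;
  // Before: if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
  if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
    return false; // treat the instruction as having side effects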
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index 97b439c..310cc4b 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -99,27 +99,6 @@ class MachineModuleInfo {
/// want.
MachineModuleInfoImpl *ObjFileMMI;
- /// \name Exception Handling
- /// \{
-
- /// The current call site index being processed, if any. 0 if none.
- unsigned CurCallSite = 0;
-
- /// \}
-
- // TODO: Ideally, what we'd like is to have a switch that allows emitting
- // synchronous (precise at call-sites only) CFA into .eh_frame. However,
- // even under this switch, we'd like .debug_frame to be precise when using
- // -g. At this moment, there's no way to specify that some CFI directives
- // go into .eh_frame only, while others go into .debug_frame only.
-
- /// True if debugging information is available in this module.
- bool DbgInfoAvailable = false;
-
- /// True if this module is being built for windows/msvc, and uses floating
- /// point. This is used to emit an undefined reference to _fltused.
- bool UsesMSVCFloatingPoint = false;
-
/// Maps IR Functions to their corresponding MachineFunctions.
DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
/// Next unique number available for a MachineFunction.
@@ -186,23 +165,6 @@ public:
return const_cast<MachineModuleInfo*>(this)->getObjFileInfo<Ty>();
}
- /// Returns true if valid debug info is present.
- bool hasDebugInfo() const { return DbgInfoAvailable; }
-
- bool usesMSVCFloatingPoint() const { return UsesMSVCFloatingPoint; }
-
- void setUsesMSVCFloatingPoint(bool b) { UsesMSVCFloatingPoint = b; }
-
- /// \name Exception Handling
- /// \{
-
- /// Set the call site currently being processed.
- void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
-
- /// Get the call site currently being processed, if any. return zero if
- /// none.
- unsigned getCurrentCallSite() { return CurCallSite; }
-
/// \}
}; // End class MachineModuleInfo
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index a905c85..966e04f 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -490,6 +490,12 @@ m_c_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) {
CC);
}
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_Select(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+ return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::SELECT, Cond, T, F);
+}
+
// === Binary operations ===
template <typename LHS_P, typename RHS_P, bool Commutable = false,
bool ExcludeChain = false>
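A usage sketch for the new matcher inside a combine (N is the SDNode or SDValue under inspection; illustrative, following the existing SDPatternMatch idiom):

  using namespace llvm::SDPatternMatch;
  SDValue Cond, TVal, FVal;
  if (sd_match(N, m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
    // N is an ISD::SELECT; Cond, TVal, and FVal are bound to its operands.
  }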
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 24eab7b..6a80c8c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -76,6 +76,7 @@ struct KnownBits;
class LLVMContext;
class MachineBasicBlock;
class MachineConstantPoolValue;
+class MachineModuleInfo;
class MCSymbol;
class OptimizationRemarkEmitter;
class ProfileSummaryInfo;
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 54f71c9..882cade 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -46,6 +46,7 @@ class LiveIntervals;
class LiveVariables;
class MachineLoop;
class MachineMemOperand;
+class MachineModuleInfo;
class MachineRegisterInfo;
class MCAsmInfo;
class MCInst;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7..a2b124d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -725,11 +725,6 @@ public:
const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
- const DenseMap<Value *, SmallVector<const Value *, 16>> &
- getUnderlyingObjects() const {
- return UnderlyingObjects;
- }
-
private:
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
@@ -1455,22 +1450,23 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
}
/// Check whether the access through \p Ptr has a constant stride.
-std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
- Type *AccessTy, Value *Ptr,
- const Loop *Lp,
- const DenseMap<Value *, const SCEV *> &StridesMap,
- bool Assume, bool ShouldCheckWrap) {
+std::optional<int64_t>
+llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
+ const Loop *Lp,
+ const DenseMap<Value *, const SCEV *> &StridesMap,
+ bool Assume, bool ShouldCheckWrap) {
+ const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+ if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
+ return {0};
+
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
-
if (isa<ScalableVectorType>(AccessTy)) {
LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
<< "\n");
return std::nullopt;
}
- const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
-
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (Assume && !AR)
AR = PSE.getAsAddRec(Ptr);
@@ -1897,23 +1893,11 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
return ScaledDist % Stride;
}
-/// Returns true if any of the underlying objects has a loop varying address,
-/// i.e. may change in \p L.
-static bool
-isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
- ScalarEvolution &SE, const Loop *L) {
- return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
- return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
- });
-}
-
std::variant<MemoryDepChecker::Dependence::DepType,
MemoryDepChecker::DepDistanceStrideAndSizeInfo>
MemoryDepChecker::getDependenceDistanceStrideAndSize(
const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
- const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+ const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
const auto &DL = InnermostLoop->getHeader()->getDataLayout();
auto &SE = *PSE.getSE();
const auto &[APtr, AIsWrite] = A;
@@ -1931,12 +1915,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;
- int64_t StrideAPtr =
- getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true)
- .value_or(0);
- int64_t StrideBPtr =
- getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true)
- .value_or(0);
+ std::optional<int64_t> StrideAPtr =
+ getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
+ std::optional<int64_t> StrideBPtr =
+ getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -1944,26 +1926,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
// If the induction step is negative we have to invert source and sink of the
// dependence when measuring the distance between them. We should not swap
// AIsWrite with BIsWrite, as their uses expect them in program order.
- if (StrideAPtr < 0) {
+ if (StrideAPtr && *StrideAPtr < 0) {
std::swap(Src, Sink);
std::swap(AInst, BInst);
+ std::swap(StrideAPtr, StrideBPtr);
}
const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
- << "(Induction step: " << StrideAPtr << ")\n");
+ << "\n");
LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
<< ": " << *Dist << "\n");
- // Needs accesses where the addresses of the accessed underlying objects do
- // not change within the loop.
- if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
- InnermostLoop) ||
- isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
- InnermostLoop))
- return MemoryDepChecker::Dependence::IndirectUnsafe;
-
// Check if we can prove that Sink only accesses memory after Src's end or
// vice versa. At the moment this is limited to cases where either source or
// sink are loop invariant to avoid compile-time increases. This is not
@@ -1985,12 +1960,33 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
}
}
- // Need accesses with constant strides and the same direction. We don't want
- // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
- // could wrap in the address space.
- if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
- (StrideAPtr < 0 && StrideBPtr > 0)) {
+ // Need accesses with constant strides and the same direction for further
+ // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
+ // similar code or pointer arithmetic that could wrap in the address space.
+
+ // If either Src or Sink is not strided (i.e. not a non-wrapping AddRec) and
+ // not loop-invariant (the stride is 0 in that case), we cannot analyze the
+ // dependence further and also cannot generate runtime checks.
+ if (!StrideAPtr || !StrideBPtr) {
LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+ return MemoryDepChecker::Dependence::IndirectUnsafe;
+ }
+
+ int64_t StrideAPtrInt = *StrideAPtr;
+ int64_t StrideBPtrInt = *StrideBPtr;
+ LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
+ << " Sink induction step: " << StrideBPtrInt << "\n");
+ // At least one of Src and Sink is loop-invariant, and the other is strided
+ // or invariant. We can generate a runtime check to disambiguate the accesses.
+ if (StrideAPtrInt == 0 || StrideBPtrInt == 0)
+ return MemoryDepChecker::Dependence::Unknown;
+
+ // Both Src and Sink have a constant stride, check if they are in the same
+ // direction.
+ if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) ||
+ (StrideAPtrInt < 0 && StrideBPtrInt > 0)) {
+ LLVM_DEBUG(
+ dbgs() << "Pointer access with strides in different directions\n");
return MemoryDepChecker::Dependence::Unknown;
}
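Condensed, the rewritten stride handling above is this decision table (a sketch over the optional strides, not the verbatim code):

  if (!StrideAPtr || !StrideBPtr)
    return Dependence::IndirectUnsafe; // not an AddRec, not loop-invariant
  if (*StrideAPtr == 0 || *StrideBPtr == 0)
    return Dependence::Unknown;        // one side invariant: runtime-checkable
  if ((*StrideAPtr > 0) != (*StrideBPtr > 0))
    return Dependence::Unknown;        // constant strides, opposite directions
  // Otherwise both strides are constant with the same sign; continue to the
  // distance-based analysis using their absolute values.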
@@ -1999,22 +1995,20 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
if (!HasSameSize)
TypeByteSize = 0;
- return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
- std::abs(StrideBPtr), TypeByteSize,
+ return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
+ std::abs(StrideBPtrInt), TypeByteSize,
AIsWrite, BIsWrite);
}
-MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
- const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
- unsigned BIdx,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+MemoryDepChecker::Dependence::DepType
+MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx) {
assert(AIdx < BIdx && "Must pass arguments in program order");
// Get the dependence distance, stride, type size and what access writes for
// the dependence between A and B.
- auto Res = getDependenceDistanceStrideAndSize(
- A, InstMap[AIdx], B, InstMap[BIdx], UnderlyingObjects);
+ auto Res =
+ getDependenceDistanceStrideAndSize(A, InstMap[AIdx], B, InstMap[BIdx]);
if (std::holds_alternative<Dependence::DepType>(Res))
return std::get<Dependence::DepType>(Res);
@@ -2248,10 +2242,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
return Dependence::BackwardVectorizable;
}
-bool MemoryDepChecker::areDepsSafe(
- const DepCandidates &AccessSets, const MemAccessInfoList &CheckDeps,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets,
+ const MemAccessInfoList &CheckDeps) {
MinDepDistBytes = -1;
SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2294,8 +2286,8 @@ bool MemoryDepChecker::areDepsSafe(
if (*I1 > *I2)
std::swap(A, B);
- Dependence::DepType Type = isDependent(*A.first, A.second, *B.first,
- B.second, UnderlyingObjects);
+ Dependence::DepType Type =
+ isDependent(*A.first, A.second, *B.first, B.second);
mergeInStatus(Dependence::isSafeForVectorization(Type));
// Gather dependences unless we accumulated MaxDependences
@@ -2650,8 +2642,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
if (Accesses.isDependencyCheckNeeded()) {
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
DepsAreSafe = DepChecker->areDepsSafe(DependentAccesses,
- Accesses.getDependenciesToCheck(),
- Accesses.getUnderlyingObjects());
+ Accesses.getDependenciesToCheck());
if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 2297b27..4e35664 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -436,6 +436,7 @@ bool AsmPrinter::doInitialization(Module &M) {
MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
HasSplitStack = false;
HasNoSplitStack = false;
+ DbgInfoAvailable = !M.debug_compile_units().empty();
AddrLabelSymbols = nullptr;
@@ -541,8 +542,7 @@ bool AsmPrinter::doInitialization(Module &M) {
if (EmitCodeView && TM.getTargetTriple().isOSWindows())
DebugHandlers.push_back(std::make_unique<CodeViewDebug>(this));
if (!EmitCodeView || M.getDwarfVersion()) {
- assert(MMI && "MMI could not be nullptr here!");
- if (MMI->hasDebugInfo()) {
+ if (hasDebugInfo()) {
DD = new DwarfDebug(this);
DebugHandlers.push_back(std::unique_ptr<DwarfDebug>(DD));
}
@@ -1277,8 +1277,7 @@ AsmPrinter::getFunctionCFISectionType(const Function &F) const {
if (MAI->usesCFIWithoutEH() && F.hasUWTable())
return CFISection::EH;
- assert(MMI != nullptr && "Invalid machine module info");
- if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
+ if (hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
return CFISection::Debug;
return CFISection::None;
@@ -1669,10 +1668,9 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
}
/// Returns true if function begin and end labels should be emitted.
-static bool needFuncLabels(const MachineFunction &MF,
- const MachineModuleInfo &MMI) {
- if (!MF.getLandingPads().empty() || MF.hasEHFunclets() ||
- MMI.hasDebugInfo() ||
+static bool needFuncLabels(const MachineFunction &MF, const AsmPrinter &Asm) {
+ if (Asm.hasDebugInfo() || !MF.getLandingPads().empty() ||
+ MF.hasEHFunclets() ||
MF.getFunction().hasMetadata(LLVMContext::MD_pcsections))
return true;
@@ -1944,7 +1942,7 @@ void AsmPrinter::emitFunctionBody() {
// are automatically sized.
bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm();
- if (needFuncLabels(*MF, *MMI) || EmitFunctionSize) {
+ if (EmitFunctionSize || needFuncLabels(*MF, *this)) {
// Create a symbol for the end of function.
CurrentFnEnd = createTempSymbol("func_end");
OutStreamer->emitLabel(CurrentFnEnd);
@@ -2588,7 +2586,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
if (F.hasFnAttribute("patchable-function-entry") ||
F.hasFnAttribute("function-instrument") ||
F.hasFnAttribute("xray-instruction-threshold") ||
- needFuncLabels(MF, *MMI) || NeedsLocalForSize ||
+ needFuncLabels(MF, *this) || NeedsLocalForSize ||
MF.getTarget().Options.EmitStackSizeSection ||
MF.getTarget().Options.BBAddrMap || MF.hasBBLabels()) {
CurrentFnBegin = createTempSymbol("func_begin");
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index dddc08b..7700ffd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -613,7 +613,7 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
void CodeViewDebug::beginModule(Module *M) {
// If module doesn't have named metadata anchors or COFF debug section
// is not available, skip any debug info related stuff.
- if (!MMI->hasDebugInfo() ||
+ if (!Asm->hasDebugInfo() ||
!Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
Asm = nullptr;
return;
@@ -636,7 +636,7 @@ void CodeViewDebug::beginModule(Module *M) {
}
void CodeViewDebug::endModule() {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
// The COFF .debug$S section consists of several subsections, each starting
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 6c70c47..de2263c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -241,10 +241,7 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
Ty->getTag() == dwarf::DW_TAG_unspecified_type;
}
-static bool hasDebugInfo(const MachineModuleInfo *MMI,
- const MachineFunction *MF) {
- if (!MMI->hasDebugInfo())
- return false;
+static bool hasDebugInfo(const MachineFunction *MF) {
auto *SP = MF->getFunction().getSubprogram();
if (!SP)
return false;
@@ -258,7 +255,7 @@ static bool hasDebugInfo(const MachineModuleInfo *MMI,
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
PrevInstBB = nullptr;
- if (!Asm || !hasDebugInfo(MMI, MF)) {
+ if (!Asm || !hasDebugInfo(MF)) {
skippedNonDebugFunction();
return;
}
@@ -354,7 +351,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
}
void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
assert(CurMI == nullptr);
@@ -380,7 +377,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
}
void DebugHandlerBase::endInstruction() {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
assert(CurMI != nullptr);
@@ -415,7 +412,7 @@ void DebugHandlerBase::endInstruction() {
}
void DebugHandlerBase::endFunction(const MachineFunction *MF) {
- if (Asm && hasDebugInfo(MMI, MF))
+ if (Asm && hasDebugInfo(MF))
endFunctionImpl(MF);
DbgValues.clear();
DbgLabels.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5f1f315..ac4d0f2 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1148,14 +1148,15 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
void DwarfDebug::beginModule(Module *M) {
DebugHandlerBase::beginModule(M);
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm)
return;
unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
M->debug_compile_units_end());
+ if (NumDebugCUs == 0)
+ return;
+
assert(NumDebugCUs > 0 && "Asm unexpectedly initialized");
- assert(MMI->hasDebugInfo() &&
- "DebugInfoAvailabilty unexpectedly not initialized");
SingleCU = NumDebugCUs == 1;
DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
GVMap;
@@ -1433,7 +1434,7 @@ void DwarfDebug::endModule() {
// If we aren't actually generating debug info (check beginModule -
// conditionalized on the presence of the llvm.dbg.cu metadata node)
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
// Finalize the debug info for the module.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 92a03eb5..1dc2785 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1891,7 +1891,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
// Also avoid moving code above predicated instruction since it's hard to
// reason about register liveness with predicated instruction.
bool DontMoveAcrossStore = true;
- if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+ if (!PI->isSafeToMove(DontMoveAcrossStore) || TII->isPredicated(*PI))
return MBB->end();
// Find out what registers are live. Note this routine is ignoring other live
@@ -2015,7 +2015,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
break;
bool DontMoveAcrossStore = true;
- if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+ if (!TIB->isSafeToMove(DontMoveAcrossStore))
break;
// Remove kills from ActiveDefsSet, these registers had short live ranges.
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 578854c..7fc25cd 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -92,7 +92,7 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const {
// Don't delete instructions with side effects.
bool SawStore = false;
- if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+ if (!MI->isSafeToMove(SawStore) && !MI->isPHI())
return false;
// Examine each operand.
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index a5c9949..d506c62 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -235,7 +235,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
- if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ if (!MI.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "Can't speculate: " << MI);
return false;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index c906f3a..0806648 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -236,7 +236,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
// If we can move an instruction, we can remove it. Otherwise, it has
// a side-effect of some sort.
bool SawStore = false;
- if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
+ if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
return false;
// Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index f378956..ba5605a 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -2097,7 +2097,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
static bool MaySpeculate(const MachineInstr &MI,
SmallSet<MCPhysReg, 4> &LaterRedefs) {
bool SawStore = true;
- if (!MI.isSafeToMove(nullptr, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
for (const MachineOperand &MO : MI.operands()) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7b7b545..c3c581d 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -238,7 +238,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
// We also need to make sure it is safe to move the load.
// Assume there are stores between DefMI and UseMI.
bool SawStore = true;
- if (!DefMI->isSafeToMove(nullptr, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return false;
LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
@@ -300,7 +300,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
// Use the same criteria as DeadMachineInstructionElim.
bool SawStore = false;
- if (!MI->isSafeToMove(nullptr, SawStore)) {
+ if (!MI->isSafeToMove(SawStore)) {
LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
return;
}
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index af7d6c4..3e3e3e5 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -153,7 +153,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (!MI.isSafeToMove(nullptr, SawStore)) {
+ if (!MI.isSafeToMove(SawStore)) {
// If MI has side effects, it should become a barrier for code motion.
// IOM is rebuilt from the next instruction to prevent later
// instructions from being moved before this MI.
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 7f6a752..40bde20 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -162,9 +162,9 @@ static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
}
MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
- const TargetSubtargetInfo &STI,
- unsigned FunctionNum, MachineModuleInfo &mmi)
- : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
+ const TargetSubtargetInfo &STI, MCContext &Ctx,
+ unsigned FunctionNum)
+ : F(F), Target(Target), STI(&STI), Ctx(Ctx) {
FunctionNumber = FunctionNum;
init();
}
@@ -654,9 +654,14 @@ void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
/// True if this function needs frame moves for debug or exceptions.
bool MachineFunction::needsFrameMoves() const {
- return getMMI().hasDebugInfo() ||
- getTarget().Options.ForceDwarfFrameSection ||
- F.needsUnwindTableEntry();
+ // TODO: Ideally, what we'd like is to have a switch that allows emitting
+ // synchronous (precise at call-sites only) CFA into .eh_frame. However, even
+ // under this switch, we'd like .debug_frame to be precise when using -g. At
+ // this moment, there's no way to specify that some CFI directives go into
+ // .eh_frame only, while others go into .debug_frame only.
+ return getTarget().Options.ForceDwarfFrameSection ||
+ F.needsUnwindTableEntry() ||
+ !F.getParent()->debug_compile_units().empty();
}
namespace llvm {
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 24eb360..e7a4d6d 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -37,7 +37,7 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
.getCachedResult<MachineModuleAnalysis>(*F.getParent())
->getMMI();
auto MF = std::make_unique<MachineFunction>(
- F, *TM, STI, Context.generateMachineFunctionNum(F), MMI);
+ F, *TM, STI, MMI.getContext(), Context.generateMachineFunctionNum(F));
MF->initTargetMachineFunctionInfo(STI);
// MRI callback for target specific initializations.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index be64e9c..583bc1b 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1293,7 +1293,7 @@ void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
/// isSafeToMove - Return true if it is safe to move this instruction. If
/// SawStore is set to true, it means that there is a store (or call) between
/// the instruction's location and its intended destination.
-bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
+bool MachineInstr::isSafeToMove(bool &SawStore) const {
// Ignore stuff that we obviously can't move.
//
// Treat volatile loads as stores. This is not strictly necessary for
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d75df2a..1e4bf4b 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1075,7 +1075,7 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
// Check if it's safe to move the instruction.
bool DontMoveAcrossStore = !HoistConstLoads || !AllowedToHoistLoads[CurLoop];
- if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+ if ((!I.isSafeToMove(DontMoveAcrossStore)) &&
!(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
return false;
diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index 1f596cd..9d650e4 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -158,8 +158,7 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
Register FrameReg) {
DefedReg = MCRegister::NoRegister;
bool SawStore = true;
- if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
- MI->isInlineAsm())
+ if (!MI->isSafeToMove(SawStore) || MI->isImplicitDef() || MI->isInlineAsm())
return false;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 088e7602..c664959 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -26,10 +26,7 @@ MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
void MachineModuleInfo::initialize() {
ObjFileMMI = nullptr;
- CurCallSite = 0;
NextFnNum = 0;
- UsesMSVCFloatingPoint = false;
- DbgInfoAvailable = false;
}
void MachineModuleInfo::finalize() {
@@ -47,7 +44,6 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
MachineFunctions(std::move(MMI.MachineFunctions)) {
Context.setObjectFileInfo(TM.getObjFileLowering());
ObjFileMMI = MMI.ObjFileMMI;
- CurCallSite = MMI.CurCallSite;
ExternalContext = MMI.ExternalContext;
TheModule = MMI.TheModule;
}
@@ -90,7 +86,7 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
if (I.second) {
// No pre-existing machine function, create a new one.
const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
- MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+ MF = new MachineFunction(F, TM, STI, getContext(), NextFnNum++);
MF->initTargetMachineFunctionInfo(STI);
// MRI callback for target specific initializations.
@@ -210,7 +206,6 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
Ctx.diagnose(
DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
});
- MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
return false;
}
@@ -235,6 +230,5 @@ MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) {
Ctx.diagnose(
DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
});
- MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
return Result(MMI);
}
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 4b3ff57..f10b98c 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -374,7 +374,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
// Check if it's safe to move the instruction.
bool SawStore = true;
- if (!MI.isSafeToMove(AA, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
// Convergent operations may not be made control-dependent on additional
@@ -687,7 +687,7 @@ void MachineSinking::FindCycleSinkCandidates(
continue;
}
bool DontMoveAcrossStore = true;
- if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) {
+ if (!MI.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n");
continue;
}
@@ -1654,7 +1654,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
return false;
// Check if it's safe to move the instruction.
- if (!MI.isSafeToMove(AA, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
// Convergent operations may not be made control-dependent on additional
@@ -1705,7 +1705,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
bool TryBreak = false;
bool Store =
MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
- if (!MI.isSafeToMove(AA, Store)) {
+ if (!MI.isSafeToMove(Store)) {
LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f29ebe..dae0cb2 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -739,7 +739,7 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
bool SawStore = false;
// Check if it's safe to remove the instruction due to side effects.
// We can, and want to, remove Phis here.
- if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+ if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
++MI;
continue;
}
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 043ea20..f6c53f3 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1320,7 +1320,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
if (!definesFullReg(*DefMI, SrcReg))
return false;
bool SawStore = false;
- if (!DefMI->isSafeToMove(AA, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return false;
const MCInstrDesc &MCID = DefMI->getDesc();
if (MCID.getNumDefs() != 1)
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index ef9f783..e255bba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1203,11 +1203,6 @@ void FastISel::handleDbgInfo(const Instruction *II) {
if (DbgLabelRecord *DLR = dyn_cast<DbgLabelRecord>(&DR)) {
assert(DLR->getLabel() && "Missing label");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DLR << "\n");
- continue;
- }
-
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DLR->getDebugLoc(),
TII.get(TargetOpcode::DBG_LABEL))
.addMetadata(DLR->getLabel());
@@ -1402,12 +1397,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::dbg_declare: {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
assert(DI->getVariable() && "Missing variable");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
- << " (!hasDebugInfo)\n");
- return true;
- }
-
if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
return true;
@@ -1446,11 +1435,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::dbg_label: {
const DbgLabelInst *DI = cast<DbgLabelInst>(II);
assert(DI->getLabel() && "Missing label");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
- return true;
- }
-
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index af77b00..33a53df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3395,13 +3395,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps);
} else {
Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps);
}
return;
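When known-bits analysis proves the carry (or borrow) out of the low half is zero, the high half no longer needs an overflow-producing node; schematically, for a double-wide add (a sketch, not the verbatim legalizer code):

  // Lo = UADDO(A.lo, B.lo)                     -> {sum, carry}
  // Hi = carry known zero
  //          ? ADD(A.hi, B.hi)                 // plain node, single result
  //          : UADDO_CARRY(A.hi, B.hi, carry)  // carry-chained otherwise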
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1791f1b..c554c0f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6708,10 +6708,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
case Intrinsic::eh_sjlj_callsite: {
ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
- assert(DAG.getMMI()->getCurrentCallSite() == 0 &&
- "Overlapping call sites!");
+ assert(FuncInfo.getCurrentCallSite() == 0 && "Overlapping call sites!");
- DAG.getMMI()->setCurrentCallSite(CI->getZExtValue());
+ FuncInfo.setCurrentCallSite(CI->getZExtValue());
return;
}
case Intrinsic::eh_sjlj_functioncontext: {
@@ -8619,7 +8618,6 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
const BasicBlock *EHPadBB,
MCSymbol *&BeginLabel) {
MachineFunction &MF = DAG.getMachineFunction();
- MachineModuleInfo &MMI = MF.getMMI();
// Insert a label before the invoke call to mark the try range. This can be
// used to detect deletion of the invoke via the MachineModuleInfo.
@@ -8627,13 +8625,13 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
// For SjLj, keep track of which landing pads go with which invokes
// so as to maintain the ordering of pads in the LSDA.
- unsigned CallSiteIndex = MMI.getCurrentCallSite();
+ unsigned CallSiteIndex = FuncInfo.getCurrentCallSite();
if (CallSiteIndex) {
MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
// Now that the call site is handled, stop tracking it.
- MMI.setCurrentCallSite(0);
+ FuncInfo.setCurrentCallSite(0);
}
return DAG.getEHLabel(getCurSDLoc(), Chain, BeginLabel);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 401d23b..84331d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,30 +417,6 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F,
- MachineModuleInfo &MMI) {
- // Only needed for MSVC
- if (!TT.isWindowsMSVCEnvironment())
- return;
-
- // If it's already set, nothing to do.
- if (MMI.usesMSVCFloatingPoint())
- return;
-
- for (const Instruction &I : instructions(F)) {
- if (I.getType()->isFPOrFPVectorTy()) {
- MMI.setUsesMSVCFloatingPoint(true);
- return;
- }
- for (const auto &Op : I.operands()) {
- if (Op->getType()->isFPOrFPVectorTy()) {
- MMI.setUsesMSVCFloatingPoint(true);
- return;
- }
- }
- }
-}
-
PreservedAnalyses
SelectionDAGISelPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
@@ -802,9 +778,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
}
}
- // Determine if floating point is used for msvc
- computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, *CurDAG->getMMI());
-
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
FuncInfo->clear();
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 6f7905c..fb6274b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -948,7 +948,7 @@ bool TwoAddressInstructionImpl::rescheduleMIBelowKill(
return false;
bool SeenStore = true;
- if (!MI->isSafeToMove(AA, SeenStore))
+ if (!MI->isSafeToMove(SeenStore))
return false;
if (TII->getInstrLatency(InstrItins, *MI) > 1)
@@ -1131,7 +1131,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI(
return false;
bool SeenStore = true;
- if (!KillMI->isSafeToMove(AA, SeenStore))
+ if (!KillMI->isSafeToMove(SeenStore))
return false;
SmallVector<Register, 2> Uses;
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index fc4d05c..49e5211 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -416,7 +416,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
- if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ if (!I.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 63048c7..aaa2922 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2169,27 +2169,6 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
return Ret;
}
-static int sizeToSubRegIndex(unsigned Size) {
- switch (Size) {
- case 32:
- return AMDGPU::sub0;
- case 64:
- return AMDGPU::sub0_sub1;
- case 96:
- return AMDGPU::sub0_sub1_sub2;
- case 128:
- return AMDGPU::sub0_sub1_sub2_sub3;
- case 256:
- return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- default:
- if (Size < 32)
- return AMDGPU::sub0;
- if (Size > 256)
- return -1;
- return sizeToSubRegIndex(llvm::bit_ceil(Size));
- }
-}
-
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
@@ -2293,8 +2272,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
if (SrcSize > 32) {
- int SubRegIdx = sizeToSubRegIndex(DstSize);
- if (SubRegIdx == -1)
+ unsigned SubRegIdx =
+ DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
+ if (SubRegIdx == AMDGPU::NoSubRegister)
return false;
// Deal with weird cases where the class only partially supports the subreg
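The open-coded size switch is replaced by the generic sub-register query; the mapping it computes is, illustratively (assuming AMDGPU's usual sub-register indices):

  // DstSize < 32       -> AMDGPU::sub0
  // DstSize == 32      -> TRI.getSubRegFromChannel(0, 1) == sub0
  // DstSize == 64      -> TRI.getSubRegFromChannel(0, 2) == sub0_sub1
  // DstSize == 96      -> TRI.getSubRegFromChannel(0, 3) == sub0_sub1_sub2
  // no covering index  -> AMDGPU::NoSubRegister, so the selector bails out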
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fc7aba3..c31f85d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -65,11 +65,6 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
printU32ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
@@ -719,29 +714,25 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_vdst:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_va_vdst:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_va_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_vm_vsrc:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_vm_vsrc:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_exp:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_exp:" << formatDec(MI->getOperand(OpNo).getImm());
}
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
@@ -1065,16 +1056,13 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
O << formatDec((Imm & 0xc0) >> 6) << ']';
} else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
(Imm <= DppCtrl::ROW_SHL_LAST)) {
- O << "row_shl:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_shl:" << formatDec(Imm - DppCtrl::ROW_SHL0);
} else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
(Imm <= DppCtrl::ROW_SHR_LAST)) {
- O << "row_shr:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_shr:" << formatDec(Imm - DppCtrl::ROW_SHR0);
} else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
(Imm <= DppCtrl::ROW_ROR_LAST)) {
- O << "row_ror:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_ror:" << formatDec(Imm - DppCtrl::ROW_ROR0);
} else if (Imm == DppCtrl::WAVE_SHL1) {
if (AMDGPU::isGFX10Plus(STI)) {
O << "/* wave_shl is not supported starting from GFX10 */";
@@ -1126,15 +1114,14 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
"than GFX90A/GFX10 */";
return;
}
- printU4ImmDecOperand(MI, OpNo, O);
+ O << formatDec(Imm - DppCtrl::ROW_SHARE_FIRST);
} else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
(Imm <= DppCtrl::ROW_XMASK_LAST)) {
if (!AMDGPU::isGFX10Plus(STI)) {
O << "/* row_xmask is not supported on ASICs earlier than GFX10 */";
return;
}
- O << "row_xmask:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_xmask:" << formatDec(Imm - DppCtrl::ROW_XMASK_FIRST);
} else {
O << "/* Invalid dpp_ctrl value */";
}
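Dropping printU4ImmDecOperand also makes the printed value explicit: the old helper masked the raw immediate with 0xf, which matched the row number only because the DppCtrl range bases happen to be 16-aligned, and it silently truncated any wider wait counts. Illustrative examples (enum values assumed from DppCtrl):

  // Imm == DppCtrl::ROW_SHL0 + 5         -> prints "row_shl:5"
  // Imm == DppCtrl::ROW_XMASK_FIRST + 9  -> prints "row_xmask:9"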
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c5fad38..4a39022 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -38,7 +38,6 @@ public:
private:
void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4218b7d..3d6e8ef 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -950,7 +950,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
(extract_cpol $cachepolicy))> {
- let OtherPredicates = [isGFX7Only];
+ let SubtargetPredicate = isGFX7Only;
let AddedComplexity = 1;
}
@@ -958,12 +958,12 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isNotGFX9Plus];
+ let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
// 4. Offset as an 32-bit SGPR + immediate
@@ -972,7 +972,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
}
@@ -981,28 +981,28 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
def : GCNPat <
(node (SMRDImm i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. SGPR offset
def : GCNPat <
(node (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
(node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 4. No offset
def : GCNPat <
(vt (node (i64 SReg_64:$sbase))),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
@@ -1012,14 +1012,14 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
def : GCNPat <
(name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. Offset as an 32-bit SGPR
def : GCNPat <
(name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. Offset as an 32-bit SGPR + immediate
@@ -1028,7 +1028,7 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 18cb516..3b9195b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2311,7 +2311,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6926b02..1922675 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1037,7 +1037,7 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
if (MI->isInlineAsm())
continue;
// Delete PHIs if possible.
- if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+ if (!MI->isPHI() && !MI->isSafeToMove(Store))
continue;
bool AllDead = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 8840c27..5e52cf0 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1451,7 +1451,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
Opc == TargetOpcode::LIFETIME_END)
continue;
bool Store = false;
- if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+ if (MI->isInlineAsm() || !MI->isSafeToMove(Store))
continue;
bool AllDead = true;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index b8a3743..9f3d9b7 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -483,7 +483,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 208bd3d..f52e188 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -450,6 +450,24 @@ public:
IsValidKind;
}
+ bool isSImm20pcaddi() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+ VK == LoongArchMCExpr::VK_LoongArch_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2;
+ return IsConstantImm
+ ? isInt<20>(Imm) && IsValidKind
+ : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+ IsValidKind;
+ }
+
bool isSImm21lsl2() const {
if (!isImm())
return false;
@@ -1676,6 +1694,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
/*Upper=*/(1 << 19) - 1,
"operand must be a symbol with modifier (e.g. %call36) or an integer "
"in the range");
+ case Match_InvalidSImm20pcaddi:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+ /*Upper=*/(1 << 19) - 1,
+ "operand must be a symbol with modifier (e.g. %pcrel_20) or an integer "
+ "in the range");
case Match_InvalidSImm21lsl2:
return generateImmOutOfRangeError(
Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ec0d0714..ef647a4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -397,6 +397,10 @@ def simm20_pcaddu18i : SImm20Operand {
let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
}
+def simm20_pcaddi : SImm20Operand {
+ let ParserMatchClass = SImmAsmOperand<20, "pcaddi">;
+}
+
def simm21_lsl2 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
let EncoderMethod = "getImmOpValueAsr<2>";
@@ -754,7 +758,7 @@ def SLT : ALU_3R<0x00120000>;
def SLTU : ALU_3R<0x00128000>;
def SLTI : ALU_2RI12<0x02000000, simm12>;
def SLTUI : ALU_2RI12<0x02400000, simm12>;
-def PCADDI : ALU_1RI20<0x18000000, simm20>;
+def PCADDI : ALU_1RI20<0x18000000, simm20_pcaddi>;
def PCADDU12I : ALU_1RI20<0x1c000000, simm20>;
def PCALAU12I : ALU_1RI20<0x1a000000, simm20_pcalau12i>;
def AND : ALU_3R<0x00148000>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 29ed14f..370f5b0 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,8 @@ enum Fixups {
fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
// Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+ // 20-bit fixup corresponding to %pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_pcrel20_s2,
// 36-bit fixup corresponding to %call36(foo) for a pair of instructions:
// pcaddu18i+jirl.
fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
@@ -142,6 +144,12 @@ enum Fixups {
fixup_loongarch_tls_le_add_r,
// 12-bit fixup corresponding to %le_lo12_r(foo) for instruction addi.w/d.
fixup_loongarch_tls_le_lo12_r,
+ // 20-bit fixup corresponding to %ld_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_ld_pcrel20_s2,
+ // 20-bit fixup corresponding to %gd_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_gd_pcrel20_s2,
+ // 20-bit fixup corresponding to %desc_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_desc_pcrel20_s2,
};
} // end namespace LoongArch
} // end namespace llvm
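
The _PCREL20_S2 suffix reflects how these fixups are resolved: pcaddi computes PC + SignExtend(si20 << 2), so the linker takes a 4-byte-aligned, 22-bit PC-relative offset, shifts it right by 2, and stores it in the 20-bit immediate field (bits 24:5 of the instruction word, per the 1RI20 format). A hedged sketch of that packing, assuming the R_LARCH_PCREL20_S2 semantics offset = sym + addend - pc; the helper name is illustrative, not LLVM's:

    #include <cassert>
    #include <cstdint>

    // Illustrative only: pack a pc-relative offset into pcaddi's si20 field.
    uint32_t packPcrel20S2(uint32_t Insn, int64_t Offset) {
      assert((Offset & 0x3) == 0 && "offset must be 4-byte aligned");
      int64_t Imm = Offset >> 2;
      assert(Imm >= -(1 << 19) && Imm <= (1 << 19) - 1 && "out of si20 range");
      return Insn | ((static_cast<uint32_t>(Imm) & 0xfffff) << 5); // bits 24:5
    }

    int main() {
      // pcaddi's opcode is 0x18000000; an offset of +8 encodes immediate 2.
      assert(packPcrel20S2(0x18000000u, 8) == (0x18000000u | (2u << 5)));
    }
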
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index efbfce3..4f7f93f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -287,6 +287,18 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R:
FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r;
break;
+ case LoongArchMCExpr::VK_LoongArch_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_ld_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_gd_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_desc_pcrel20_s2;
+ break;
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 98b9b1a..53d46cc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -166,6 +166,14 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
return "le_add_r";
case VK_LoongArch_TLS_LE_LO12_R:
return "le_lo12_r";
+ case VK_LoongArch_PCREL20_S2:
+ return "pcrel_20";
+ case VK_LoongArch_TLS_LD_PCREL20_S2:
+ return "ld_pcrel_20";
+ case VK_LoongArch_TLS_GD_PCREL20_S2:
+ return "gd_pcrel_20";
+ case VK_LoongArch_TLS_DESC_PCREL20_S2:
+ return "desc_pcrel_20";
}
}
@@ -222,6 +230,10 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
.Case("le_hi20_r", VK_LoongArch_TLS_LE_HI20_R)
.Case("le_add_r", VK_LoongArch_TLS_LE_ADD_R)
.Case("le_lo12_r", VK_LoongArch_TLS_LE_LO12_R)
+ .Case("pcrel_20", VK_LoongArch_PCREL20_S2)
+ .Case("ld_pcrel_20", VK_LoongArch_TLS_LD_PCREL20_S2)
+ .Case("gd_pcrel_20", VK_LoongArch_TLS_GD_PCREL20_S2)
+ .Case("desc_pcrel_20", VK_LoongArch_TLS_DESC_PCREL20_S2)
.Default(VK_LoongArch_Invalid);
}
@@ -264,6 +276,9 @@ void LoongArchMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
case VK_LoongArch_TLS_GD_HI20:
case VK_LoongArch_TLS_DESC_PC_HI20:
case VK_LoongArch_TLS_DESC_HI20:
+ case VK_LoongArch_TLS_LD_PCREL20_S2:
+ case VK_LoongArch_TLS_GD_PCREL20_S2:
+ case VK_LoongArch_TLS_DESC_PCREL20_S2:
break;
}
fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 1546d89..91215b8 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -75,6 +75,10 @@ public:
VK_LoongArch_TLS_LE_HI20_R,
VK_LoongArch_TLS_LE_ADD_R,
VK_LoongArch_TLS_LE_LO12_R,
+ VK_LoongArch_PCREL20_S2,
+ VK_LoongArch_TLS_LD_PCREL20_S2,
+ VK_LoongArch_TLS_GD_PCREL20_S2,
+ VK_LoongArch_TLS_DESC_PCREL20_S2,
VK_LoongArch_Invalid // Must be the last item.
};
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0b654ab..d7197a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -493,7 +493,7 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
// Emit initial .loc debug directive for correct relocation symbol data.
if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
assert(SP->getUnit());
- if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+ if (!SP->getUnit()->isDebugDirectivesOnly())
emitInitialRawDwarfLocDirective(*MF);
}
}
@@ -912,7 +912,7 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
if (HasFullDebugInfo)
break;
}
- if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
+ if (HasFullDebugInfo)
O << ", debug";
O << "\n";
@@ -928,8 +928,6 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
- bool HasDebugInfo = MMI && MMI->hasDebugInfo();
-
// If we did not emit any functions, then the global declarations have not
// yet been emitted.
if (!GlobalsEmitted) {
@@ -945,7 +943,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
auto *TS =
static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer());
// Close the last emitted section
- if (HasDebugInfo) {
+ if (hasDebugInfo()) {
TS->closeLastSection();
// Emit empty .debug_loc section for better support of the empty files.
OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}");
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0e2811f..dcde863 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -3137,11 +3137,11 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
MCSymbol *TempSym = OutContext.createNamedTempSymbol();
OutStreamer->emitLabel(TempSym);
- OutStreamer->emitXCOFFExceptDirective(CurrentFnSym, TempSym,
- LangMO.getImm(), ReasonMO.getImm(),
- Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8 :
- MI->getMF()->getInstructionCount() * 4,
- MMI->hasDebugInfo());
+ OutStreamer->emitXCOFFExceptDirective(
+ CurrentFnSym, TempSym, LangMO.getImm(), ReasonMO.getImm(),
+ Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8
+ : MI->getMF()->getInstructionCount() * 4,
+ hasDebugInfo());
break;
}
case PPC::GETtlsMOD32AIX:
@@ -3199,7 +3199,7 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
bool PPCAIXAsmPrinter::doFinalization(Module &M) {
// Do streamer related finalization for DWARF.
- if (!MAI->usesDwarfFileAndLocDirectives() && MMI->hasDebugInfo())
+ if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo())
OutStreamer->doFinalizationAtSectionEnd(
OutStreamer->getContext().getObjectFileInfo()->getTextSection());
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3c868db..2da03d8 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1370,9 +1370,7 @@ def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()"
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
- "SiFive 7-Series processors",
- [TuneNoDefaultUnroll,
- TuneShortForwardBranchOpt]>;
+ "SiFive 7-Series processors">;
def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
"Ventana Veyron-Series processors">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0339b30..41429ff 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14309,6 +14309,14 @@ struct NodeExtensionHelper {
return RISCVISD::VFWSUB_VL;
case RISCVISD::FMUL_VL:
return RISCVISD::VFWMUL_VL;
+ case RISCVISD::VFMADD_VL:
+ return RISCVISD::VFWMADD_VL;
+ case RISCVISD::VFMSUB_VL:
+ return RISCVISD::VFWMSUB_VL;
+ case RISCVISD::VFNMADD_VL:
+ return RISCVISD::VFWNMADD_VL;
+ case RISCVISD::VFNMSUB_VL:
+ return RISCVISD::VFWNMSUB_VL;
default:
llvm_unreachable("Unexpected opcode");
}
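
These new opcode mappings extend the widening combine from plain binary ops to FMAs: a VFMADD_VL whose multiplicands are fp_extends of narrower vectors can become a single VFWMADD_VL that extends and fuses in one instruction. A scalar sketch of the identity that makes the rewrite sound, under the usual assumption that the float-to-double extension is exact:

    #include <cassert>
    #include <cmath>

    int main() {
      float A = 1.1f, B = -3.7f;
      double Acc = 0.125;
      double PA = static_cast<double>(A), PB = static_cast<double>(B);
      // The product of two floats is exactly representable in double
      // (24-bit x 24-bit significands fit in 53 bits), so the fused and
      // unfused widening forms agree bit-for-bit:
      assert(std::fma(PA, PB, Acc) == PA * PB + Acc);
    }
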
@@ -14502,6 +14510,11 @@ struct NodeExtensionHelper {
Subtarget.hasStdExtZvbb();
case RISCVISD::SHL_VL:
return Subtarget.hasStdExtZvbb();
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ return true;
default:
return false;
}
@@ -14582,6 +14595,10 @@ struct NodeExtensionHelper {
case RISCVISD::FADD_VL:
case RISCVISD::FMUL_VL:
case RISCVISD::VFWADD_W_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
return true;
case ISD::SUB:
case RISCVISD::SUB_VL:
@@ -14797,6 +14814,10 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
Strategies.push_back(canFoldToVW_W);
break;
case RISCVISD::FMUL_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
Strategies.push_back(canFoldToVWWithSameExtension);
break;
case ISD::MUL:
@@ -14833,7 +14854,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
}
} // End anonymous namespace.
-/// Combine a binary operation to its equivalent VW or VW_W form.
+/// Combine a binary or FMA operation to its equivalent VW or VW_W form.
/// The supported combines are:
/// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w
/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
@@ -14846,9 +14867,9 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// vwsub_w(u) -> vwsub(u)
/// vfwadd_w -> vfwadd
/// vfwsub_w -> vfwsub
-static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const RISCVSubtarget &Subtarget) {
+static SDValue combineOp_VLToVWOp_VL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
if (DCI.isBeforeLegalize())
return SDValue();
@@ -14864,19 +14885,26 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget))
- return SDValue();
NodeExtensionHelper LHS(Root, 0, DAG, Subtarget);
NodeExtensionHelper RHS(Root, 1, DAG, Subtarget);
- auto AppendUsersIfNeeded = [&Worklist,
+ auto AppendUsersIfNeeded = [&Worklist, &Subtarget,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
- for (SDNode *TheUse : Op.OrigOperand->uses()) {
+ for (SDNode::use_iterator UI = Op.OrigOperand->use_begin(),
+ UE = Op.OrigOperand->use_end();
+ UI != UE; ++UI) {
+ SDNode *TheUse = *UI;
+ if (!NodeExtensionHelper::isSupportedRoot(TheUse, Subtarget))
+ return false;
+ // We only support the first 2 operands of FMA.
+ if (UI.getOperandNo() >= 2)
+ return false;
if (Inserted.insert(TheUse).second)
Worklist.push_back(TheUse);
}
}
+ return true;
};
// Control the compile time by limiting the number of node we look at in
@@ -14904,9 +14932,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
// we would be leaving the old input (since it may still be used),
// and the new one.
if (Res->LHSExt.has_value())
- AppendUsersIfNeeded(LHS);
+ if (!AppendUsersIfNeeded(LHS))
+ return SDValue();
if (Res->RHSExt.has_value())
- AppendUsersIfNeeded(RHS);
+ if (!AppendUsersIfNeeded(RHS))
+ return SDValue();
break;
}
}
@@ -14993,7 +15023,7 @@ static SDValue performVWADDSUBW_VLCombine(SDNode *N,
assert(Opc == RISCVISD::VWADD_W_VL || Opc == RISCVISD::VWADDU_W_VL ||
Opc == RISCVISD::VWSUB_W_VL || Opc == RISCVISD::VWSUBU_W_VL);
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineVWADDSUBWSelect(N, DCI.DAG);
@@ -15408,8 +15438,11 @@ static SDValue combineVFMADD_VLWithVFNEG_VL(SDNode *N, SelectionDAG &DAG) {
VL);
}
-static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performVFMADD_VLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
if (SDValue V = combineVFMADD_VLWithVFNEG_VL(N, DAG))
return V;
@@ -15421,50 +15454,7 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
if (N->isTargetStrictFPOpcode())
return SDValue();
- // Try to form widening FMA.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue Mask = N->getOperand(3);
- SDValue VL = N->getOperand(4);
-
- if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL ||
- Op1.getOpcode() != RISCVISD::FP_EXTEND_VL)
- return SDValue();
-
- // TODO: Refactor to handle more complex cases similar to
- // combineBinOp_VLToVWBinOp_VL.
- if ((!Op0.hasOneUse() || !Op1.hasOneUse()) &&
- (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0)))
- return SDValue();
-
- // Check the mask and VL are the same.
- if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL ||
- Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
- return SDValue();
-
- unsigned NewOpc;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case RISCVISD::VFMADD_VL:
- NewOpc = RISCVISD::VFWMADD_VL;
- break;
- case RISCVISD::VFNMSUB_VL:
- NewOpc = RISCVISD::VFWNMSUB_VL;
- break;
- case RISCVISD::VFNMADD_VL:
- NewOpc = RISCVISD::VFWNMADD_VL;
- break;
- case RISCVISD::VFMSUB_VL:
- NewOpc = RISCVISD::VFWMSUB_VL;
- break;
- }
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
-
- return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0), Op0, Op1,
- N->getOperand(2), Mask, VL);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
@@ -16661,28 +16651,28 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::ADD: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
return V;
return performADDCombine(N, DCI, Subtarget);
}
case ISD::SUB: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performSUBCombine(N, DAG, Subtarget);
}
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performORCombine(N, DCI, Subtarget);
}
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performMULCombine(N, DAG, DCI, Subtarget);
case ISD::SDIV:
@@ -17107,7 +17097,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -17132,7 +17122,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
@@ -17148,7 +17138,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::VWADD_W_VL:
@@ -17158,7 +17148,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return performVWADDSUBW_VLCombine(N, DCI, Subtarget);
case RISCVISD::SUB_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
@@ -17167,7 +17157,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::STRICT_VFNMADD_VL:
case RISCVISD::STRICT_VFMSUB_VL:
case RISCVISD::STRICT_VFNMSUB_VL:
- return performVFMADD_VLCombine(N, DAG, Subtarget);
+ return performVFMADD_VLCombine(N, DCI, Subtarget);
case RISCVISD::FADD_VL:
case RISCVISD::FSUB_VL:
case RISCVISD::FMUL_VL:
@@ -17176,7 +17166,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (N->getValueType(0).getVectorElementType() == MVT::f32 &&
!Subtarget.hasVInstructionsF16())
return SDValue();
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
case ISD::LOAD:
case ISD::STORE: {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1df51c1..0620c3f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1365,7 +1365,7 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6eed2ae..1729bc0 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -83,9 +83,11 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
def ROCKET : RISCVTuneProcessorModel<"rocket",
RocketModel>;
+defvar SiFive7TuneFeatures = [TuneSiFive7, TuneNoDefaultUnroll,
+ TuneShortForwardBranchOpt,
+ FeaturePostRAScheduler];
def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
- SiFive7Model,
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7Model, SiFive7TuneFeatures>;
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
@@ -145,7 +147,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
@@ -189,7 +191,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtD,
FeatureStdExtC,
FeatureStdExtZihintpause],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
@@ -212,8 +214,11 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
+defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
+ [TuneDLenFactor2,
+ TuneOptimizedZeroStrideLoad]);
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
@@ -229,10 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZvfh,
FeatureStdExtZba,
FeatureStdExtZbb],
- [TuneSiFive7,
- FeaturePostRAScheduler,
- TuneDLenFactor2,
- TuneOptimizedZeroStrideLoad]>;
+ SiFiveX280TuneFeatures>;
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index f328c55..20c014a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -56,7 +56,9 @@ public:
MachineFunctionProperties::Property::IsSSA);
}
- StringRef getPassName() const override { return "RISC-V Fold Masks"; }
+ StringRef getPassName() const override {
+ return "RISC-V Vector Peephole Optimization";
+ }
private:
bool convertToVLMAX(MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index b2b8814..3833939 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6653,7 +6653,8 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
// Defer the creation of the bitcast from X to combineExtract,
// which might be able to optimize the extraction.
- VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+ VecVT = EVT::getVectorVT(*DCI.DAG.getContext(),
+ MVT::getIntegerVT(TruncBytes * 8),
VecVT.getStoreSize() / TruncBytes);
EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 16bbfd4..d075889 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -631,8 +631,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(nullptr, SawStore) ||
- !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+ if (!DefMI->isSafeToMove(SawStore) || !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;
int UseOpIdx =
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 0c2c6bf..957eb21 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -975,6 +976,33 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
}
}
+/// True if this module is being built for windows/msvc, and uses floating
+/// point. This is used to emit an undefined reference to _fltused. This is
+/// needed in Windows kernel or driver contexts to find and prevent code from
+/// modifying non-GPR registers.
+///
+/// TODO: It would be better if this was computed from MIR by looking for
+/// selected floating-point instructions.
+static bool usesMSVCFloatingPoint(const Triple &TT, const Module &M) {
+ // Only needed for MSVC
+ if (!TT.isWindowsMSVCEnvironment())
+ return false;
+
+ for (const Function &F : M) {
+ for (const Instruction &I : instructions(F)) {
+ if (I.getType()->isFPOrFPVectorTy())
+ return true;
+
+ for (const auto &Op : I.operands()) {
+ if (Op->getType()->isFPOrFPVectorTy())
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
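+
The module scan above replaces the removed MachineModuleInfo::usesMSVCFloatingPoint() flag: instead of tracking FP use during instruction selection, the asm printer now walks every instruction's result and operand types at the end of the file. A schematic standalone model of that scan, with plain structs standing in for LLVM's Function/Instruction (not the LLVM API):

    #include <cassert>
    #include <vector>

    // Hypothetical stand-ins for IR instructions, for illustration only.
    struct InstModel {
      bool ResultIsFP = false;
      std::vector<bool> OperandIsFP;
    };

    bool usesFloatingPoint(const std::vector<InstModel> &Insts) {
      for (const InstModel &I : Insts) {
        if (I.ResultIsFP)
          return true; // result type is FP or an FP vector
        for (bool OpFP : I.OperandIsFP)
          if (OpFP)
            return true; // an operand type is FP or an FP vector
      }
      return false;
    }

    int main() {
      assert(!usesFloatingPoint({{false, {false}}}));
      assert(usesFloatingPoint({{false, {false, true}}})); // FP operand found
    }
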
void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
@@ -993,7 +1021,7 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
// safe to set.
OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
} else if (TT.isOSBinFormatCOFF()) {
- if (MMI->usesMSVCFloatingPoint()) {
+ if (usesMSVCFloatingPoint(TT, M)) {
// In Windows' libcmt.lib, there is a file which is linked in only if the
// symbol _fltused is referenced. Linking this in causes some
// side-effects:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f3e6aaf..918a608 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5588,7 +5588,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(nullptr, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return nullptr;
// Collect information about virtual register operands of MI.
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 63ac910..697d30a 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -792,10 +792,10 @@ def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
def: InstRW<[BWWriteResGroup28], (instrs VPBROADCASTBrr,
VPBROADCASTWrr)>;
-def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 516dc62..c4d2ad7 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1247,10 +1247,10 @@ def: InstRW<[HWWriteResGroup53_1], (instrs VPMOVSXBWYrm,
VPMOVSXWDYrm,
VPMOVZXWDYrm)>;
-def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7b33aed..6966400 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -638,7 +638,8 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
MMX_PALIGNRrri,
MMX_PSIGNBrr,
MMX_PSIGNDrr,
- MMX_PSIGNWrr)>;
+ MMX_PSIGNWrr,
+ MMX_PSUBQrr)>;
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
@@ -898,7 +899,8 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let NumMicroOps = 2;
let ReleaseAtCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>;
+def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm,
+ MMX_PSUBQrm)>;
def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 3ee931f..d764cb3 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -404,7 +404,7 @@ defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW
defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
-defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
// Vector integer shifts.
defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 161fe13..f710456 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3596,6 +3596,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
+ { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
+ { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
+ { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
+ { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
{ ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
{ ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
{ ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
@@ -3674,8 +3678,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
{ ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
{ ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
+ { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
+ { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
{ ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
- { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8} },
+ { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
{ ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
{ ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
{ ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
@@ -3694,6 +3700,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
{ ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
{ ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
+ { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104 } },
+ { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
{ ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
@@ -3828,6 +3836,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
{ ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
{ ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
+ { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
+ { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
+ { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
+ { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
+ { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
+ { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
+ { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
+ { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
{ ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
@@ -3849,6 +3865,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
{ ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
{ ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
+ { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
+ { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
+ { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
+ { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
+ { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
+ { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
+ { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
+ { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
{ ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
{ ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
@@ -3925,6 +3949,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
+ { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
+ { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
+ { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
+ { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
+ { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
+ { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
+ { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
{ ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
@@ -3945,6 +3977,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
+ { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
+ { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
+ { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
+ { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
+ { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
+ { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
+ { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
{ ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
@@ -4020,6 +4060,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
{ ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
+ { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
+ { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
+ { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
+ { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
{ ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
{ ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
{ ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
@@ -4030,6 +4074,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
{ ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
+ { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
+ { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
+ { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
+ { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
};
static const CostKindTblEntry SSSE3CostTbl[] = {
{ ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
@@ -4091,6 +4139,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
{ ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
+ { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
+ { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
+ { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
+ { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
{ ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
{ ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
{ ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
@@ -4107,6 +4159,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
{ ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
{ ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
+ { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
+ { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
+ { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
+ { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
{ ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
{ ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index b382fed..832506f 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -1725,6 +1725,9 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
Map.getShadow(S->getTrueValue()),
Map.getShadow(S->getFalseValue()));
+ if (auto *Freeze = dyn_cast<FreezeInst>(&Inst))
+ return Builder.CreateFreeze(Map.getShadow(Freeze->getOperand(0)));
+
if (auto *Extract = dyn_cast<ExtractElementInst>(&Inst))
return Builder.CreateExtractElement(
Map.getShadow(Extract->getVectorOperand()), Extract->getIndexOperand());
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 91ef2b4..329b3ef 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2779,6 +2781,68 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
+/// Reassociate associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants that we want to hoist.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ auto *BO = dyn_cast<BinaryOperator>(&I);
+ if (!BO || !BO->isAssociative())
+ return false;
+
+ // Only fold ADDs for now.
+ Instruction::BinaryOps Opcode = BO->getOpcode();
+ if (Opcode != Instruction::Add)
+ return false;
+
+ auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(0));
+ if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative())
+ return false;
+
+ // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+ Value *LV = BO0->getOperand(0);
+ Value *C1 = BO0->getOperand(1);
+ Value *C2 = BO->getOperand(1);
+
+ if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2))
+ return false;
+
+ auto *Preheader = L.getLoopPreheader();
+ assert(Preheader && "Loop is not in simplify form?");
+
+ auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op",
+ Preheader->getTerminator());
+ auto *NewBO =
+ BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);
+
+ // Copy NUW for ADDs if both instructions have it.
+ if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() &&
+ BO0->hasNoUnsignedWrap()) {
+ Inv->setHasNoUnsignedWrap(true);
+ NewBO->setHasNoUnsignedWrap(true);
+ }
+
+ BO->replaceAllUsesWith(NewBO);
+ eraseInstruction(*BO, SafetyInfo, MSSAU);
+
+ // (LV op C1) might not be erased if it has more uses than the one we just
+ // replaced.
+ if (BO0->use_empty())
+ eraseInstruction(*BO0, SafetyInfo, MSSAU);
+
+ return true;
+}
+
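+
The transformation in hoistBOAssociation is profitable because it moves one of the two additions out of the loop: "(LV op C1) op C2" becomes "LV op (C1 op C2)", and "C1 op C2" is computed once in the preheader. Note that nuw is reapplied only when both original adds carried it, since reassociation could otherwise introduce an intermediate wrap. A scalar sketch of the before/after shapes (plain C++, illustrative names):

    #include <cassert>

    int sumBefore(const int *X, int N, int C1, int C2) {
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += (X[I] + C1) + C2; // two invariant adds per iteration
      return S;
    }

    int sumAfter(const int *X, int N, int C1, int C2) {
      int Inv = C1 + C2; // "invariant.op", hoisted to the preheader
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += X[I] + Inv; // one invariant add per iteration
      return S;
    }

    int main() {
      int X[] = {1, 2, 3, 4};
      assert(sumBefore(X, 4, 5, 7) == sumAfter(X, 4, 5, 7));
    }
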
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2816,6 +2880,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
+ if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ ++NumHoisted;
+ ++NumBOAssociationsHoisted;
+ return true;
+ }
+
return false;
}
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index b38db41..ee0d95b 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -347,6 +347,30 @@ void llvm::createMemCpyLoopUnknownSize(
}
}
+// If \p Addr1 and \p Addr2 are pointers to different address spaces, create
+// an addrspacecast to obtain a pair of pointers in the same address space.
+// The caller needs to ensure that addrspacecasting is possible.
+// No-op if the pointers are in the same address space.
+static std::pair<Value *, Value *>
+tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
+ const TargetTransformInfo &TTI) {
+ Value *ResAddr1 = Addr1;
+ Value *ResAddr2 = Addr2;
+
+ unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
+ unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
+ if (AS1 != AS2) {
+ if (TTI.isValidAddrSpaceCast(AS2, AS1))
+ ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
+ else if (TTI.isValidAddrSpaceCast(AS1, AS2))
+ ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
+ else
+ llvm_unreachable("Can only lower memmove between address spaces if they "
+ "support addrspacecast");
+ }
+ return {ResAddr1, ResAddr2};
+}
+
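+
The helper tries both cast directions: cast Addr2 into Addr1's address space if that is valid, otherwise cast Addr1 into Addr2's; if neither cast is valid, the memmove lowering cannot proceed (the real code asserts unreachable). A standalone sketch of that direction-picking logic, with bool flags standing in for TTI.isValidAddrSpaceCast:

    #include <cassert>
    #include <optional>
    #include <utility>

    // Illustrative model: pick which pointer to cast so both end up in one
    // address space. Returns the resulting (AS of Addr1, AS of Addr2).
    std::optional<std::pair<unsigned, unsigned>>
    commonAddrSpace(unsigned AS1, unsigned AS2, bool CanCast2To1,
                    bool CanCast1To2) {
      if (AS1 == AS2)
        return std::pair{AS1, AS2}; // no-op
      if (CanCast2To1)
        return std::pair{AS1, AS1}; // cast Addr2 into Addr1's space
      if (CanCast1To2)
        return std::pair{AS2, AS2}; // cast Addr1 into Addr2's space
      return std::nullopt;          // unreachable in the real lowering
    }

    int main() {
      assert(commonAddrSpace(0, 5, true, false)->second == 0);
      assert(commonAddrSpace(0, 5, false, true)->first == 5);
    }
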
// Lower memmove to IR. memmove is required to correctly copy overlapping memory
// regions; therefore, it has to check the relative positions of the source and
// destination pointers and choose the copy direction accordingly.
@@ -369,17 +393,61 @@ void llvm::createMemCpyLoopUnknownSize(
// }
// return dst;
// }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
+//
+// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
+// used for the memory accesses in the loops. Then, additional loops with
+// byte-wise accesses are added for the remaining bytes.
+static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getDataLayout();
- // TODO: Use different element type if possible?
- Type *EltTy = Type::getInt8Ty(F->getContext());
+ LLVMContext &Ctx = OrigBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ bool LoopOpIsInt8 = LoopOpType == Int8Type;
+
+ // If the memory accesses are wider than one byte, residual loops with
+ // i8-accesses are required to move remaining bytes.
+ bool RequiresResidual = !LoopOpIsInt8;
+
+ Type *ResidualLoopOpType = Int8Type;
+ unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+ // Calculate the loop trip count and remaining bytes to copy after the loop.
+ IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+ ConstantInt *One = ConstantInt::get(ILengthType, 1);
+
+ IRBuilder<> PLBuilder(InsertBefore);
+
+ Value *RuntimeLoopCount = CopyLen;
+ Value *RuntimeLoopRemainder = nullptr;
+ Value *RuntimeBytesCopiedMainLoop = CopyLen;
+ Value *SkipResidualCondition = nullptr;
+ if (RequiresResidual) {
+ RuntimeLoopCount =
+ getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+ RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+ CILoopOpSize, LoopOpSize);
+ RuntimeBytesCopiedMainLoop =
+ PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+ SkipResidualCondition =
+ PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
+ }
+ Value *SkipMainCondition =
+ PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
// Create a comparison of src and dst, based on which we jump to either
// the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -387,76 +455,374 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
// SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
// structure. Its block terminators (unconditional branches) are replaced by
// the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
+ // If the pointers are in different address spaces, they need to be converted
+ // to a compatible one. Cases where memory ranges in the different address
+ // spaces cannot overlap are lowered as memcpy and not handled here.
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ // If the LoopOpSize is greater than 1, each part of the function consists of
+ // four blocks:
+ // memmove_copy_backwards:
+ // skip the residual loop when 0 iterations are required
+ // memmove_bwd_residual_loop:
+ // copy the last few bytes individually so that the remaining length is
+ // a multiple of the LoopOpSize
+ // memmove_bwd_middle: skip the main loop when 0 iterations are required
+ // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
+ // memmove_copy_forward: skip the main loop when 0 iterations are required
+ // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
+ // memmove_fwd_middle: skip the residual loop when 0 iterations are required
+ // memmove_fwd_residual_loop: copy the last few bytes individually
+ //
+ // The main and residual loop are switched between copying forward and
+ // backward so that the residual loop always operates on the end of the moved
+ // range. This is based on the assumption that buffers whose start is aligned
+ // with the LoopOpSize are more common than buffers whose end is.
+ //
+ // If the LoopOpSize is 1, each part of the function consists of two blocks:
+ // memmove_copy_backwards: skip the loop when 0 iterations are required
+ // memmove_bwd_main_loop: the actual backwards loop BB
+ // memmove_copy_forward: skip the loop when 0 iterations are required
+ // memmove_fwd_main_loop: the actual forward loop BB
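+
A compact standalone model of the control structure described in the comment above, assuming a 4-byte loop operation type: a byte-granular residual loop plus a wide main loop, run residual-first when copying backwards and residual-last when copying forwards, so the residual always handles the tail of the moved range. This mirrors the behavior of the emitted CFG, not LLVM's IR-building code:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    void memmoveModel(uint8_t *Dst, const uint8_t *Src, size_t Len) {
      constexpr size_t LoopOpSize = 4; // stand-in for the TTI lowering type
      const size_t Count = Len / LoopOpSize;       // main-loop trip count
      const size_t Rem = Len - Count * LoopOpSize; // residual byte count
      uint8_t Tmp[LoopOpSize]; // models the loaded "element" register
      if (Src < Dst) {
        // Backwards: byte residual first (tail of the range), then wide loop.
        for (size_t I = Len; I > Len - Rem; --I)
          Dst[I - 1] = Src[I - 1];
        for (size_t I = Count; I > 0; --I) {
          std::memcpy(Tmp, Src + (I - 1) * LoopOpSize, LoopOpSize); // load
          std::memcpy(Dst + (I - 1) * LoopOpSize, Tmp, LoopOpSize); // store
        }
      } else {
        // Forwards: wide loop first, then byte residual (tail of the range).
        for (size_t I = 0; I < Count; ++I) {
          std::memcpy(Tmp, Src + I * LoopOpSize, LoopOpSize); // load
          std::memcpy(Dst + I * LoopOpSize, Tmp, LoopOpSize); // store
        }
        for (size_t I = Count * LoopOpSize; I < Len; ++I)
          Dst[I] = Src[I];
      }
    }

    int main() {
      uint8_t Buf[16];
      for (int I = 0; I < 16; ++I)
        Buf[I] = static_cast<uint8_t>(I);
      memmoveModel(Buf + 2, Buf, 11); // overlapping move, Src < Dst
      for (int I = 0; I < 11; ++I)
        assert(Buf[I + 2] == I);
    }
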
BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
+ CopyBackwardsBB->setName("memmove_copy_backwards");
BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
+ CopyForwardBB->setName("memmove_copy_forward");
BasicBlock *ExitBB = InsertBefore->getParent();
ExitBB->setName("memmove_done");
- unsigned PartSize = DL.getTypeStoreSize(EltTy);
- Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
- Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+ // Accesses in the residual loops do not share the same alignment as those in
+ // the main loops.
+ Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
+ Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
// Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
+ {
+ BasicBlock *MainLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
+
+ // The predecessor of the memmove_bwd_main_loop. Updated in the
+ // following if a residual loop is emitted first.
+ BasicBlock *PredBB = CopyBackwardsBB;
+
+ if (RequiresResidual) {
+ // backwards residual loop
+ BasicBlock *ResidualLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
+ IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
+ Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
+ ResidualLoopPhi, One, "bwd_residual_index");
+ Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, SrcAddr, ResidualIndex);
+ Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+ ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+ "element");
+ Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, DstAddr, ResidualIndex);
+ ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+ ResidualDstAlign, DstIsVolatile);
+
+ // After the residual loop, go to an intermediate block.
+ BasicBlock *IntermediateBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
+ // Later code expects a terminator in the PredBB.
+ IRBuilder<> IntermediateBuilder(IntermediateBB);
+ IntermediateBuilder.CreateUnreachable();
+ ResidualLoopBuilder.CreateCondBr(
+ ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
+ RuntimeBytesCopiedMainLoop),
+ IntermediateBB, ResidualLoopBB);
+
+ ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+
+ // How to get to the residual:
+ BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
+ ThenTerm->getIterator());
+ ThenTerm->eraseFromParent();
+
+ PredBB = IntermediateBB;
+ }
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateAlignedLoad(
- EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- PartSrcAlign, SrcIsVolatile, "element");
- LoopBuilder.CreateAlignedStore(
- Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
- PartDstAlign, DstIsVolatile);
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
- ThenTerm->eraseFromParent();
+ // main loop
+ IRBuilder<> MainLoopBuilder(MainLoopBB);
+ PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
+ Value *MainIndex =
+ MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+ Value *LoadGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+ Value *Element = MainLoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+ MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
+ ExitBB, MainLoopBB);
+ MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+ MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+
+ // How to get to the main loop:
+ Instruction *PredBBTerm = PredBB->getTerminator();
+ BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
+ PredBBTerm->getIterator());
+ PredBBTerm->eraseFromParent();
+ }
// Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
- Value *FwdElement = FwdLoopBuilder.CreateAlignedLoad(
- EltTy, SrcGEP, PartSrcAlign, SrcIsVolatile, "element");
- Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
- FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign,
- DstIsVolatile);
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
- ElseTerm->eraseFromParent();
+ // main loop
+ {
+ BasicBlock *MainLoopBB =
+ BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
+ IRBuilder<> MainLoopBuilder(MainLoopBB);
+ PHINode *MainLoopPhi =
+ MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
+ Value *LoadGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+ Value *Element = MainLoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+ MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+ MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+ MainLoopPhi->addIncoming(Zero, CopyForwardBB);
+
+ Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
+ BasicBlock *SuccessorBB = ExitBB;
+ if (RequiresResidual)
+ SuccessorBB =
+ BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
+
+ // leaving or staying in the main loop
+ MainLoopBuilder.CreateCondBr(
+ MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopCount), SuccessorBB,
+ MainLoopBB);
+
+ // getting in or skipping the main loop
+ BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+ CopyFwdBBTerm->getIterator());
+ CopyFwdBBTerm->eraseFromParent();
+
+ if (RequiresResidual) {
+ BasicBlock *IntermediateBB = SuccessorBB;
+ IRBuilder<> IntermediateBuilder(IntermediateBB);
+ BasicBlock *ResidualLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
+ IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
+ ResidualLoopBB);
+
+ // Residual loop
+ IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ PHINode *ResidualLoopPhi =
+ ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
+ Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, SrcAddr, ResidualLoopPhi);
+ Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+ ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+ "element");
+ Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, DstAddr, ResidualLoopPhi);
+ ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+ ResidualDstAlign, DstIsVolatile);
+ Value *ResidualIndex =
+ ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, One);
+ ResidualLoopBuilder.CreateCondBr(
+ ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
+ ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(RuntimeBytesCopiedMainLoop, IntermediateBB);
+ }
+ }
+}
+
+// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
+// compile time, obsolete loops and branches are omitted, and the residual code
+// is straight-line code instead of a loop.
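+
With a constant length the divisions fold away at compile time; for example, with CopyLen = 30 and an 8-byte LoopOpType, LoopEndCount = 3, BytesCopiedInLoop = 24, and RemainingBytes = 6, and the residual is emitted as straight-line loads and stores (types chosen by getMemcpyLoopResidualLoweringType in the real code). A tiny checked sketch of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t CopyLen = 30, LoopOpSize = 8;
      uint64_t LoopEndCount = CopyLen / LoopOpSize;           // 3 wide iterations
      uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize; // 24 bytes
      uint64_t RemainingBytes = CopyLen - BytesCopiedInLoop;  // 6 residual bytes
      assert(LoopEndCount == 3 && BytesCopiedInLoop == 24 && RemainingBytes == 6);
      // The 6 residual bytes could be covered by, e.g., one 4-byte and one
      // 2-byte access (illustrative; the real choice comes from the TTI hook).
    }
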
+static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ ConstantInt *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length moves.
+ if (CopyLen->isZero())
+ return;
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getDataLayout();
+ LLVMContext &Ctx = OrigBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ // Calculate the loop trip count and remaining bytes to copy after the loop.
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+ uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
+
+ IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+ ConstantInt *One = ConstantInt::get(ILengthType, 1);
+ ConstantInt *TripCount = ConstantInt::get(ILengthType, LoopEndCount);
+
+ IRBuilder<> PLBuilder(InsertBefore);
+
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ // Helper function to generate a load/store pair of a given type in the
+ // residual. Used in the forward and backward branches.
+ auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
+ uint64_t &BytesCopied) {
+ Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+ Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+ // Calculate the new index
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no remainder!");
+
+ Value *SrcGEP = Builder.CreateInBoundsGEP(
+ OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
+ Value *DstGEP = Builder.CreateInBoundsGEP(
+ OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
+ BytesCopied += OperandSize;
+ };
+
+ // Copying backwards.
+ if (RemainingBytes != 0) {
+ CopyBackwardsBB->setName("memmove_bwd_residual");
+ uint64_t BytesCopied = BytesCopiedInLoop;
+
+ // Residual code is required to move the remaining bytes. We need the same
+ // instructions as in the forward case, only in reverse. So we generate code
+ // the same way, except that we change the IRBuilder insert point for each
+ // load/store pair so that each one is inserted before the previous one
+ // instead of after it.
+ IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI());
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, PartSrcAlign.value(),
+ PartDstAlign.value());
+ for (auto *OpTy : RemainingOps) {
+ // Reverse the order of the emitted operations.
+ BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI());
+ GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
+ }
+ }
+ if (LoopEndCount != 0) {
+ BasicBlock *LoopBB = CopyBackwardsBB;
+ BasicBlock *PredBB = OrigBB;
+ if (RemainingBytes != 0) {
+ // If we introduce residual code, it needs its own BB.
+ LoopBB = CopyBackwardsBB->splitBasicBlock(
+ CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
+ PredBB = CopyBackwardsBB;
+ } else {
+ CopyBackwardsBB->setName("memmove_bwd_loop");
+ }
+ IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
+ Value *Index = LoopBuilder.CreateSub(LoopPhi, One, "bwd_index");
+ Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, Index);
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, Index);
+ LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+
+ // Replace the unconditional branch introduced by
+ // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
+ Instruction *UncondTerm = LoopBB->getTerminator();
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
+ LoopBB);
+ UncondTerm->eraseFromParent();
+
+ LoopPhi->addIncoming(Index, LoopBB);
+ LoopPhi->addIncoming(TripCount, PredBB);
+ }
+
+ // Copying forward.
+ BasicBlock *FwdResidualBB = CopyForwardBB;
+ if (LoopEndCount != 0) {
+ CopyForwardBB->setName("memmove_fwd_loop");
+ BasicBlock *LoopBB = CopyForwardBB;
+ BasicBlock *SuccBB = ExitBB;
+ if (RemainingBytes != 0) {
+ // If we introduce residual code, it needs its own BB.
+ SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
+ "memmove_fwd_residual");
+ FwdResidualBB = SuccBB;
+ }
+ IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
+ Value *LoadGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopPhi);
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopPhi);
+ LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ Value *Index = LoopBuilder.CreateAdd(LoopPhi, One);
+ LoopPhi->addIncoming(Index, LoopBB);
+ LoopPhi->addIncoming(Zero, OrigBB);
+
+ // Replace the unconditional branch to turn LoopBB into a loop.
+ Instruction *UncondTerm = LoopBB->getTerminator();
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, TripCount), SuccBB,
+ LoopBB);
+ UncondTerm->eraseFromParent();
+ }
+
+ if (RemainingBytes != 0) {
+ uint64_t BytesCopied = BytesCopiedInLoop;
+
+ // Residual code is required to move the remaining bytes. In the forward
+ // case, we emit it in the normal order.
+ IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, PartSrcAlign.value(),
+ PartDstAlign.value());
+ for (auto *OpTy : RemainingOps)
+ GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
+ }
}
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -572,11 +938,8 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
- if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
- DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
- else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
- SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
- else {
+ if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
+ TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
// We don't know generically if it's legal to introduce an
// addrspacecast. We need to know either if it's legal to insert an
// addrspacecast, or if the address spaces cannot alias.
@@ -587,9 +950,15 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
}
}
- createMemMoveLoop(
- /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
- SrcIsVolatile, DstIsVolatile, TTI);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+ createMemMoveLoopKnownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ } else {
+ createMemMoveLoopUnknownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ }
return true;
}
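
Since the lowering above is long, here is a minimal self-contained sketch, in plain C++, of the control flow createMemMoveLoopKnownSize emits for a compile-time length. Everything in it (the name memmoveKnownSize, the fixed 4-byte loop operand, the memcpy-based chunk helper) is an illustrative assumption standing in for the emitted IR, not code from the patch:

  #include <cstdint>
  #include <cstring>

  // Hand-written model of the lowered memmove: the pointer compare picks the
  // copy direction, the main loop moves LoopOpSize-wide chunks, and leftover
  // bytes are handled by straight-line residual code rather than a loop.
  void memmoveKnownSize(uint8_t *Dst, const uint8_t *Src, uint64_t CopyLen) {
    constexpr uint64_t LoopOpSize = 4; // Stands in for the TTI-chosen type.
    const uint64_t TripCount = CopyLen / LoopOpSize;
    const uint64_t BytesCopiedInLoop = TripCount * LoopOpSize;
    auto CopyChunk = [&](uint64_t I) { // One wide load feeding one wide store.
      uint8_t Tmp[LoopOpSize];
      std::memcpy(Tmp, Src + I * LoopOpSize, LoopOpSize);
      std::memcpy(Dst + I * LoopOpSize, Tmp, LoopOpSize);
    };
    if (Src < Dst) {
      // Backward copy: residual first (highest byte first), then the main
      // loop counting down -- memmove_bwd_residual / memmove_bwd_loop.
      for (uint64_t B = CopyLen; B-- > BytesCopiedInLoop;)
        Dst[B] = Src[B];
      for (uint64_t I = TripCount; I-- > 0;)
        CopyChunk(I);
    } else {
      // Forward copy: main loop first, then the residual in normal order --
      // memmove_fwd_loop / memmove_fwd_residual.
      for (uint64_t I = 0; I < TripCount; ++I)
        CopyChunk(I);
      for (uint64_t B = BytesCopiedInLoop; B < CopyLen; ++B)
        Dst[B] = Src[B];
    }
  }
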
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e288..1a17524 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3246,7 +3246,12 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
}
/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
+ const TargetTransformInfo &TTI) {
+ // Skip threading if the branch may be divergent.
+ if (TTI.hasBranchDivergence(BB->getParent()))
+ return false;
+
int Size = 0;
EphemeralValueTracker EphTracker;
@@ -3301,10 +3306,9 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
/// If we have a conditional branch on something for which we know the constant
/// value in predecessors (e.g. a phi node in the current block), thread edges
/// from the predecessor to their ultimate destination.
-static std::optional<bool>
-FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static std::optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues;
BasicBlock *BB = BI->getParent();
Value *Cond = BI->getCondition();
@@ -3332,7 +3336,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
// Now we know that this block has multiple preds and two succs.
// Check that the block is small enough and values defined in the block are
// not used outside of it.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ if (!BlockIsSimpleEnoughToThreadThrough(BB, TTI))
return false;
for (const auto &Pair : KnownValues) {
@@ -3459,15 +3463,14 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
- DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static bool FoldCondBranchOnValueKnownInPredecessor(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
std::optional<bool> Result;
bool EverChanged = false;
do {
// Note that std::nullopt means "we changed things, but recurse further."
- Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC);
+ Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, TTI, AC);
EverChanged |= Result == std::nullopt || *Result;
} while (Result == std::nullopt);
return EverChanged;
@@ -7543,7 +7546,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this is a branch on something for which we know the constant value in
// predecessors (e.g. a phi node in the current block), thread control
// through this block.
- if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+ if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, TTI, Options.AC))
return requestResimplify();
// Scan predecessor blocks for conditional branches.
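
The new guard in isolation, as a condensed sketch of the hunk above (body elided): threading duplicates the conditional branch into each predecessor, which presumably does not pay off when the branch may be divergent, e.g. on GPU targets, so such functions now bail out before any size accounting.

  static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
                                                 const TargetTransformInfo &TTI) {
    // Divergent-branch targets: never thread across this block.
    if (TTI.hasBranchDivergence(BB->getParent()))
      return false;
    // ... size and use-outside-block checks unchanged ...
  }
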
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c63cf0c..590f315 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -365,8 +365,8 @@ public:
/// Return the best VPlan for \p VF.
VPlan &getBestPlanFor(ElementCount VF) const;
- /// Return the most profitable plan and fix its VF to the most profitable one.
- VPlan &getBestPlan() const;
+ /// Return the most profitable vectorization factor.
+ ElementCount getBestVF() const;
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
/// according to the best selected \p VF and \p UF.
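
For orientation, the reworked planner API in one hedged sketch (condensed from the LoopVectorize.cpp hunks below; LVP, IC, LB and DT are the surrounding locals there): the most profitable VF is computed first, the matching plan is then looked up, and no plan needs its VF fixed up via setVF() anymore.

  ElementCount BestVF = LVP.getBestVF();        // Cost model picks the width.
  VPlan &BestPlan = LVP.getBestPlanFor(BestVF); // Look up the plan for that VF.
  LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
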
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859..9733ac0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7162,13 +7162,12 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
return Cost;
}
-VPlan &LoopVectorizationPlanner::getBestPlan() const {
+ElementCount LoopVectorizationPlanner::getBestVF() const {
// If there is a single VPlan with a single VF, return it directly.
VPlan &FirstPlan = *VPlans[0];
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
- return FirstPlan;
+ return *FirstPlan.vectorFactors().begin();
- VPlan *BestPlan = &FirstPlan;
ElementCount ScalarVF = ElementCount::getFixed(1);
assert(hasPlanWithVF(ScalarVF) &&
"More than a single plan/VF w/o any plan having scalar VF");
@@ -7199,14 +7198,11 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (isMoreProfitable(CurrentFactor, BestFactor)) {
+ if (isMoreProfitable(CurrentFactor, BestFactor))
BestFactor = CurrentFactor;
- BestPlan = &*P;
- }
}
}
- BestPlan->setVF(BestFactor.Width);
- return *BestPlan;
+ return BestFactor.Width;
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10001,10 +9997,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- VPlan &BestPlan = LVP.getBestPlan();
- assert(BestPlan.hasScalarVFOnly() &&
+ ElementCount BestVF = LVP.getBestVF();
+ assert(BestVF.isScalar() &&
"VPlan cost model and legacy cost model disagreed");
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+ VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+ LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10015,20 +10012,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
+ ElementCount BestVF = LVP.getBestVF();
+ LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
+ assert(VF.Width == BestVF &&
+ "VPlan cost model and legacy cost model disagreed");
+ VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
- LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+ LVP.selectEpilogueVectorizationFactor(BestVF, IC);
if (EpilogueVF.Width.isVector()) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- std::unique_ptr<VPlan> BestMainPlan(
- LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
+ assert(EPI.MainLoopVF == VF.Width && "VFs must match");
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10119,18 +10121,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
- VPlan &BestPlan = LVP.getBestPlan();
- assert(size(BestPlan.vectorFactors()) == 1 &&
- "Plan should have a single VF");
- ElementCount Width = *BestPlan.vectorFactors().begin();
- LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
- << "\n");
- assert(VF.Width == Width &&
- "VPlan cost model and legacy cost model disagreed");
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);
- LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
+ LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
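
The epilogue path, condensed the same way (simplified from the hunk above; the epilogue re-run and resume-value plumbing are elided): the check against the legacy cost model happens once up front, and the main plan is still duplicated before execution, so the planner's stored plan stays intact.

  ElementCount BestVF = LVP.getBestVF();
  assert(VF.Width == BestVF &&
         "VPlan cost model and legacy cost model disagreed");
  VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
  VectorizationFactor EpilogueVF =
      LVP.selectEpilogueVectorizationFactor(BestVF, IC);
  if (EpilogueVF.Width.isVector()) {
    EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
    // Execute a copy of the main plan; the epilogue pass follows shortly.
    std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
    LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT,
                    true);
  }
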
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b74417f..3bdd8fd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8363,6 +8363,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: TTI.getStridedMemoryOpCost(
Instruction::Load, LoadTy, LI->getPointerOperand(),
/*VariableMask=*/false, Alignment, CostKind, LI);
+ // Add the cost of external uses.
+ for (auto [Idx, V] : enumerate(VL.slice(
+ P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+ if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+ GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+ LoadTy, CostKind, Idx);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
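
The intent of the added accounting, as a hedged sketch (Slice stands for the VF-wide run of loads at P.first; the other names are the surrounding locals): if some users of a load in the slice stay scalar, those users must get the value back out of the vector, so the extract is now charged on top of the memory-op cost instead of being ignored.

  for (auto [Idx, V] : enumerate(Slice))                // One lane per load.
    if (!R.areAllUsersVectorized(cast<Instruction>(V))) // Scalar users remain,
      GatherCost += TTI.getVectorInstrCost(             // so pay for the extract.
          Instruction::ExtractElement, LoadTy, CostKind, Idx);
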
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d18d5..c9da5e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2246,12 +2246,12 @@ public:
/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
class VPReductionEVLRecipe : public VPReductionRecipe {
public:
- VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+ VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
: VPReductionRecipe(
- VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
- cast_or_null<Instruction>(R->getUnderlyingValue()),
- ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
- R->isOrdered()) {}
+ VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+ cast_or_null<Instruction>(R.getUnderlyingValue()),
+ ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
+ R.isOrdered()) {}
~VPReductionEVLRecipe() override = default;
@@ -2558,10 +2558,10 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
- VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
- : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
- {L->getAddr(), EVL}, L->isConsecutive(),
- L->isReverse(), L->getDebugLoc()),
+ VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
+ {L.getAddr(), &EVL}, L.isConsecutive(),
+ L.isReverse(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2634,11 +2634,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
- VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
- : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
- {S->getAddr(), S->getStoredValue(), EVL},
- S->isConsecutive(), S->isReverse(),
- S->getDebugLoc()) {
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+ {S.getAddr(), S.getStoredValue(), &EVL},
+ S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
setMask(Mask);
}
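
At the call sites (see the VPlanTransforms.cpp hunk below), the switch from pointers to references reads as in this condensed sketch. The design point: the recipe being replaced and the EVL are always required, so references encode the non-null contract in the type, while the mask, which may legitimately be absent, stays a VPValue pointer.

  if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
    NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask); // Mask may be null.
  else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
    NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
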
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f..045f6c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1488,13 +1488,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
VPValue *NewMask = GetNewMask(MemR->getMask());
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
- NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+ NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask);
else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
- NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+ NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
else
llvm_unreachable("unsupported recipe");
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
- NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
+ NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
GetNewMask(RedR->getCondOp()));
}
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index c5da46a..963bb8a 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -990,192 +990,192 @@ declare {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x
define i32 @smul(i32 %arg) {
; SSSE3-LABEL: 'smul'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'smul'
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'smul'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'smul'
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'smul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'smul'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'smul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'smul'
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'smul'
; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'smul'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -1228,192 +1228,192 @@ declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x
define i32 @umul(i32 %arg) {
; SSSE3-LABEL: 'umul'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'umul'
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'umul'
; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'umul'
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'umul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'umul'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'umul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'umul'
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'umul'
; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'umul'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index bbe3396..88a22f9 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -51,22 +51,22 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
; THRU-LABEL: 'umul'
; THRU-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; THRU-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'umul'
; LATE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'umul'
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'umul'
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
index 2e61a28..6d8e296 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
@@ -9,21 +9,19 @@
define void @B_indices_loaded_in_loop_A_stored(ptr %A, ptr noalias %B, i64 %N, i64 %off) {
; CHECK-LABEL: 'B_indices_loaded_in_loop_A_stored'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: %l = load i32, ptr %gep.B, align 4 ->
+; CHECK-NEXT: store i32 %inc, ptr %gep.B, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %indices = load i8, ptr %gep.A, align 1 ->
+; CHECK-NEXT: store i32 %l, ptr %gep.C, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Check 0:
-; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
-; CHECK-NEXT: %gep.C = getelementptr inbounds i32, ptr %A, i64 %iv
-; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
-; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv.off
; CHECK-NEXT: Grouped accesses:
-; CHECK-NEXT: Group [[GRP1]]:
-; CHECK-NEXT: (Low: %A High: ((4 * %N) + %A))
-; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop>
-; CHECK-NEXT: Group [[GRP2]]:
-; CHECK-NEXT: (Low: (%off + %A) High: (%N + %off + %A))
-; CHECK-NEXT: Member: {(%off + %A),+,1}<nw><%loop>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
@@ -59,9 +57,9 @@ define void @B_indices_loaded_in_loop_A_not_stored(ptr %A, ptr noalias %B, i64 %
; CHECK-LABEL: 'B_indices_loaded_in_loop_A_not_stored'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %l = load i32, ptr %gep.B, align 4 ->
; CHECK-NEXT: store i32 %inc, ptr %gep.B, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
index 546a75c..28ee6c6 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
@@ -13,9 +13,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; CHECK-NEXT: for.body:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop
; CHECK-NOT: Report: cannot identify array bounds
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 ->
; CHECK-NEXT: store i16 %mul, ptr %arrayidxA, align 2
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
index 18e45f4..8ca3038 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
@@ -9,8 +9,9 @@
; CHECK-LABEL: 'negative_step'
; CHECK: LAA: Found an analyzable loop: loop
; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>(Induction step: -1)
+; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>
; CHECK-NEXT: LAA: Distance for store i32 %add, ptr %gep.A.plus.1, align 4 to %l = load i32, ptr %gep.A, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: -1 Sink induction step: -1
; CHECK-NEXT: LAA: Dependence is negative
define void @negative_step(ptr nocapture %A) {
@@ -41,8 +42,9 @@ exit:
; CHECK-LABEL: 'positive_step'
; CHECK: LAA: Found an analyzable loop: loop
; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>(Induction step: 1)
+; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>
; CHECK-NEXT: LAA: Distance for %l = load i32, ptr %gep.A, align 4 to store i32 %add, ptr %gep.A.minus.1, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: 1 Sink induction step: 1
; CHECK-NEXT: LAA: Dependence is negative
define void @positive_step(ptr nocapture %A) {
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 60fe8b4..8bef7583 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -5,9 +5,9 @@ define void @test(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
; CHECK-LABEL: 'test'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %load = load double, ptr %gep.sel, align 8 ->
; CHECK-NEXT: store double %load, ptr %gep.sel2, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 7c1b11e..f0aed24 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -276,9 +276,9 @@ define void @single_stride_used_for_trip_count(ptr noalias %A, ptr noalias %B, i
; CHECK-LABEL: 'single_stride_used_for_trip_count'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %load = load i32, ptr %gep.A, align 4 ->
; CHECK-NEXT: store i32 %add, ptr %gep.A.next, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index 4d4da86..de9af52 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -9,56 +9,71 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP: ; %bb.0:
; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; LOOP-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
-; LOOP-NEXT: s_cbranch_execz .LBB0_3
-; LOOP-NEXT: ; %bb.1: ; %copy_forward
-; LOOP-NEXT: s_mov_b64 s[6:7], 0
+; LOOP-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; LOOP-NEXT: s_cbranch_execnz .LBB0_3
+; LOOP-NEXT: ; %bb.1: ; %Flow
+; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT: s_cbranch_execnz .LBB0_4
+; LOOP-NEXT: .LBB0_2: ; %memmove_done
+; LOOP-NEXT: s_endpgm
+; LOOP-NEXT: .LBB0_3:
+; LOOP-NEXT: s_mov_b32 s6, 0
+; LOOP-NEXT: s_mov_b32 s7, 0xf000
+; LOOP-NEXT: s_mov_b64 s[4:5], 0
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT: s_waitcnt vmcnt(1)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; LOOP-NEXT: s_waitcnt vmcnt(0)
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: ; implicit-def: $vgpr2_vgpr3
+; LOOP-NEXT: ; implicit-def: $vgpr0_vgpr1
+; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT: s_cbranch_execz .LBB0_2
+; LOOP-NEXT: .LBB0_4: ; %memmove_bwd_residual
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
-; LOOP-NEXT: v_mov_b32_e32 v4, s6
-; LOOP-NEXT: v_mov_b32_e32 v5, s7
-; LOOP-NEXT: .LBB0_2: ; %copy_forward_loop
-; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT: s_waitcnt expcnt(2)
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4
-; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc
-; LOOP-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT: v_cmp_ne_u32_e32 vcc, 4, v4
+; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT: s_waitcnt vmcnt(1)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT: s_cbranch_vccnz .LBB0_2
-; LOOP-NEXT: .LBB0_3: ; %Flow17
-; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT: s_cbranch_execz .LBB0_6
-; LOOP-NEXT: ; %bb.4: ; %copy_backwards
-; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
-; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; LOOP-NEXT: s_mov_b32 s0, -4
-; LOOP-NEXT: s_mov_b32 s6, 0
-; LOOP-NEXT: s_mov_b32 s7, 0xf000
-; LOOP-NEXT: s_mov_b64 s[4:5], 0
-; LOOP-NEXT: v_mov_b32_e32 v4, s0
-; LOOP-NEXT: .LBB0_5: ; %copy_backwards_loop
-; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64
-; LOOP-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT: s_and_b64 vcc, vcc, exec
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[4:7], 0 addr64
-; LOOP-NEXT: v_add_i32_e64 v0, s[0:1], -1, v0
-; LOOP-NEXT: v_addc_u32_e64 v1, s[0:1], -1, v1, s[0:1]
-; LOOP-NEXT: v_add_i32_e64 v2, s[0:1], -1, v2
-; LOOP-NEXT: v_addc_u32_e64 v3, s[0:1], -1, v3, s[0:1]
-; LOOP-NEXT: s_cbranch_vccz .LBB0_5
-; LOOP-NEXT: .LBB0_6: ; %memmove_done
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memmove_p1i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 91abbff..74a7ca2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
; ELF: Section {
; ELF: Name: .text
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 6ec6f6a..17fe3ad 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index da88b67..16f3ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,21 +2,21 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb05613..c729618 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,17 +2,17 @@
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 2970495..f67fcd6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 7ae4766..3a2efad 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 62e50b2..d0c0b62 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 2a7206c..bc20665 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
index 99ea18c..77bc972 100644
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
--- |
define amdgpu_ps void @early_term_scc0_end_block() {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c05f9c6..600c35b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 46f0bb0..954d810 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd5e589..4ab8683 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5ffa71d3..cbe243a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index c17feae..e2635fc 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
# GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle
# GCN: bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index 8d02f06..401f6e30 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
# GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle
# GCN: S_LOAD_DWORDX2_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
index 1cb27dc..1801082 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
@@ -6,13 +6,13 @@
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck -check-prefix=CU %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic < %s | FileCheck -check-prefix=CU %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
; Checks 10.1, 10.3 and 11 generic targets allow cumode/wave64.
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
index 3dc9f20..431ba38f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
; GCN: amdhsa.kernels:
; GCN: .name: wavefrontsize
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 487e62b..3747620 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
; The SHT_NOTE section contains the output from the .hsa_code_object_*
; directives.
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index b9269e2..2cb440b 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,9 +1,9 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
index 18bc442..266da50 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
index af2e600..af7ba4e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 0bdd249..5c9c0d1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 1fd0c67..05fd883 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index fa447f9..28492f1 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index ab160ff..acd48a6 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
; Test formal argument lowering as well as calls to amdgpu_gfx functions.
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ba946fe..c62b4e5 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
; We only care about which physical registers the parameters are copied from;
; the function bodies are just some arbitrary uses.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 3ff34c0..047b35b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 0e659b7..88e3929 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index d941830..a5841e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index 3a540bd..b61ca56 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,9 +1,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
index d3961c0..d4ae040 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; GCN-LABEL: {{^}}test_init_exec:
; GFX1032: s_mov_b32 exec_lo, 0x12345
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 63d27260..e0dacd4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}gs_const:
; GCN-NOT: v_cmpx
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index a538364..3d5e6dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 368d637..d11aaf0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index d37b4b4..684ca3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 5cb57ee..9e2e37a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
@@ -87,30 +90,25 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
-; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -121,30 +119,37 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; OPT-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
-; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; OPT: copy_backwards:
-; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; OPT: copy_backwards_loop:
-; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
-; OPT-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; OPT: copy_forward:
-; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; OPT: copy_forward_loop:
-; OPT-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; OPT-NEXT: store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
-; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_residual:
+; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 64
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_fwd_residual:
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT: br label [[MEMMOVE_DONE]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
@@ -1333,30 +1338,25 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
; ALL-LABEL: @memmove_flat_align1_global_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1372,30 +1372,25 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
; ALL-LABEL: @memmove_global_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1411,30 +1406,25 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
; ALL-LABEL: @memmove_flat_align1_private_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1450,30 +1440,25 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
; ALL-LABEL: @memmove_private_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1734,6 +1719,477 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
ret void
}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_flat_align1_local_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_flat_align1_local_align1(
+; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr [[TMP7]], align 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) {
+; MAX1024-LABEL: @memmove_local_align1_flat_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_local_align1_flat_align1(
+; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_local_align1_local_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_local_align1_local_align1(
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 32
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_private_align1_private_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_private_align1_private_align1(
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 16
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
+; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 15
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_empty(
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 65, [[TMP0:%.*]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 65
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_full(
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_residual:
+; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT: store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 64
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_fwd_residual:
+; OPT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
+; OPT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
+; OPT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
+; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT: store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
+; OPT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT: [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
+; OPT-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT: store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
+; OPT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT: [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
+; OPT-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT: store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
+; OPT-NEXT: br label [[MEMMOVE_DONE]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false)
+ ret void
+}
+
define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
; OPT-LABEL: @test_umin(
; OPT-NEXT: entry:
@@ -1783,30 +2239,25 @@ define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace
;
; ALL-LABEL: @memmove_volatile(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 64, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 64, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load volatile i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store volatile i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load volatile i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store volatile i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 64
-; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 4, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 4
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index b5ee668..06ebd86 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 162c47f..2591ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index ece2e1b..c702de6 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,10 +1,10 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
# Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
#
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 92b2f51..10d0803 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -1,5 +1,5 @@
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; Add extra verifier runs. There were some cases where invalid IR
diff --git a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
index 9ad1660..c1121f9 100644
--- a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
@@ -1,5 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
---
@@ -124,4 +124,4 @@ body: |
bb.0:
liveins: $sgpr0, $sgpr1
renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
\ No newline at end of file
+ renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index f19b0a5..eaa3d22 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-MUBUF %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-FLATSCR %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -filetype=obj -verify-machineinstrs -start-before=prologepilog %s -o /dev/null
# Check that we do not crash when emitting ISA
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
index 266e673..ff87c28 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
# Ensure that PostRA Machine Sink does not sink instructions
# past block prologues which would overwrite their uses.
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a440b87..eebd32c 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
; GCN-LABEL: test_kill_depth_0_imm_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
index 0f69210..2f1eeb8 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
# GCN-LABEL: name: hazard_smem_war
# GCN: S_LOAD_DWORD_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 537aca1..639bf6a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
--- |
define amdgpu_kernel void @check_vcc() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 4b9b5f9..39c7538 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
define amdgpu_kernel void @icmp_test() {
; CHECK-LABEL: icmp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index e143611..17e3d93 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=tahiti -o - %s | FileCheck %s -check-prefixes=CHECK,SI
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
---
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
index 4592498..47f13cb 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
---
# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
index 645d473..e97c518 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
# GCN: $sgpr0 = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
index 81d17a8..76713dd 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
@@ -1,9 +1,9 @@
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
# GFX9-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX9-ERR: $vgpr0 = V_CNDMASK_B32_e64 0, $sgpr0, 0, -1, killed $sgpr0_sgpr1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
index 6c55183..bbd0817 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
# GFX12-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX12-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
index dc7d4af..b310f55 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
# GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index ab1121a..645f498 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index 829d770..ad1d66e 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 161d222..c52b079 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 511a116..ebbb3d1 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 5fde11c..0f8df5a 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 277db33..4073964 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 801bb06..2efbd32 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 12b6598..ac218a2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index b43969d..9ec2d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 11003c4..8f052ef 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
; Check that WQM isn't triggered by image load/store intrinsics.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index afa7fde..59e755a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -154,7 +154,7 @@ entry:
; IR-LABEL: @memmove_caller
; IR: icmp ult ptr %src, %dst
; IR: [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
-; IR-NEXT: %index_ptr = sub i64 [[PHIVAL]], 1
+; IR-NEXT: %bwd_main_index = sub i64 [[PHIVAL]], 1
; IR: [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
; IR: {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1
diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index 5f8c21e..ccf0e45 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -642,8 +642,8 @@ define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: add r5, r5, r4
+; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
@@ -743,214 +743,219 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64
; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r4, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: sldi r6, r6, 2
-; CHECK-NEXT: li r7, 1
-; CHECK-NEXT: mr r30, r10
-; CHECK-NEXT: cmpdi r6, 1
-; CHECK-NEXT: iselgt r7, r6, r7
-; CHECK-NEXT: addi r8, r7, -1
-; CHECK-NEXT: clrldi r6, r7, 63
-; CHECK-NEXT: cmpldi r8, 3
+; CHECK-NEXT: sldi r4, r6, 2
+; CHECK-NEXT: li r6, 1
+; CHECK-NEXT: mr r0, r10
+; CHECK-NEXT: std r10, -192(r1) # 8-byte Folded Spill
+; CHECK-NEXT: cmpdi r4, 1
+; CHECK-NEXT: iselgt r4, r4, r6
+; CHECK-NEXT: addi r7, r4, -1
+; CHECK-NEXT: clrldi r6, r4, 63
+; CHECK-NEXT: cmpldi r7, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
-; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT: mulli r24, r30, 24
-; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT: rldicl r0, r7, 62, 2
-; CHECK-NEXT: sldi r11, r30, 5
-; CHECK-NEXT: sldi r19, r30, 4
-; CHECK-NEXT: sldi r7, r14, 3
-; CHECK-NEXT: add r14, r30, r14
-; CHECK-NEXT: sldi r10, r16, 3
-; CHECK-NEXT: sldi r12, r15, 3
-; CHECK-NEXT: add r16, r30, r16
-; CHECK-NEXT: add r15, r30, r15
-; CHECK-NEXT: add r27, r11, r7
-; CHECK-NEXT: add r22, r24, r7
-; CHECK-NEXT: add r17, r19, r7
-; CHECK-NEXT: sldi r2, r14, 3
-; CHECK-NEXT: add r26, r24, r10
-; CHECK-NEXT: add r25, r24, r12
-; CHECK-NEXT: add r21, r19, r10
-; CHECK-NEXT: add r20, r19, r12
-; CHECK-NEXT: add r8, r11, r10
-; CHECK-NEXT: sldi r16, r16, 3
-; CHECK-NEXT: add r29, r5, r27
-; CHECK-NEXT: add r28, r4, r27
-; CHECK-NEXT: add r27, r3, r27
-; CHECK-NEXT: add r24, r5, r22
-; CHECK-NEXT: add r23, r4, r22
-; CHECK-NEXT: add r22, r3, r22
-; CHECK-NEXT: add r19, r5, r17
-; CHECK-NEXT: add r18, r4, r17
-; CHECK-NEXT: add r17, r3, r17
-; CHECK-NEXT: add r14, r5, r2
-; CHECK-NEXT: add r31, r4, r2
-; CHECK-NEXT: add r2, r3, r2
-; CHECK-NEXT: add r9, r5, r8
-; CHECK-NEXT: add r8, r11, r12
+; CHECK-NEXT: ld r0, -192(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r30, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r8, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: rldicl r7, r4, 62, 2
+; CHECK-NEXT: ld r9, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r11, r0, r30
+; CHECK-NEXT: add r4, r0, r0
+; CHECK-NEXT: mulli r23, r0, 24
+; CHECK-NEXT: add r14, r0, r8
+; CHECK-NEXT: sldi r12, r0, 5
+; CHECK-NEXT: add r31, r0, r9
+; CHECK-NEXT: sldi r9, r9, 3
+; CHECK-NEXT: sldi r18, r0, 4
+; CHECK-NEXT: sldi r8, r8, 3
+; CHECK-NEXT: add r10, r4, r4
+; CHECK-NEXT: sldi r4, r30, 3
+; CHECK-NEXT: sldi r11, r11, 3
+; CHECK-NEXT: add r26, r12, r9
+; CHECK-NEXT: add r16, r18, r9
+; CHECK-NEXT: add r29, r12, r8
+; CHECK-NEXT: add r19, r18, r8
+; CHECK-NEXT: add r30, r12, r4
+; CHECK-NEXT: mr r20, r4
+; CHECK-NEXT: std r4, -200(r1) # 8-byte Folded Spill
+; CHECK-NEXT: ld r4, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r15, r5, r11
+; CHECK-NEXT: sldi r11, r14, 3
+; CHECK-NEXT: add r29, r5, r29
+; CHECK-NEXT: add r28, r3, r26
+; CHECK-NEXT: add r19, r5, r19
+; CHECK-NEXT: add r21, r23, r9
+; CHECK-NEXT: add r24, r23, r8
+; CHECK-NEXT: add r14, r5, r11
+; CHECK-NEXT: sldi r11, r31, 3
+; CHECK-NEXT: add r25, r23, r20
+; CHECK-NEXT: add r20, r18, r20
+; CHECK-NEXT: add r30, r5, r30
+; CHECK-NEXT: add r18, r3, r16
+; CHECK-NEXT: add r24, r5, r24
+; CHECK-NEXT: add r23, r3, r21
+; CHECK-NEXT: add r27, r4, r26
+; CHECK-NEXT: add r22, r4, r21
+; CHECK-NEXT: add r17, r4, r16
+; CHECK-NEXT: add r2, r4, r11
+; CHECK-NEXT: rldicl r4, r7, 2, 1
+; CHECK-NEXT: sub r7, r8, r9
+; CHECK-NEXT: ld r8, -200(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r26, r5, r26
; CHECK-NEXT: add r25, r5, r25
; CHECK-NEXT: add r21, r5, r21
; CHECK-NEXT: add r20, r5, r20
; CHECK-NEXT: add r16, r5, r16
-; CHECK-NEXT: add r8, r5, r8
-; CHECK-NEXT: rldicl r3, r0, 2, 1
-; CHECK-NEXT: addi r3, r3, -4
-; CHECK-NEXT: sub r0, r12, r7
-; CHECK-NEXT: sub r12, r10, r7
-; CHECK-NEXT: li r7, 0
-; CHECK-NEXT: mr r10, r30
-; CHECK-NEXT: sldi r15, r15, 3
-; CHECK-NEXT: add r15, r5, r15
-; CHECK-NEXT: rldicl r3, r3, 62, 2
-; CHECK-NEXT: addi r3, r3, 1
-; CHECK-NEXT: mtctr r3
+; CHECK-NEXT: add r31, r5, r11
+; CHECK-NEXT: add r11, r3, r11
+; CHECK-NEXT: addi r4, r4, -4
+; CHECK-NEXT: rldicl r4, r4, 62, 2
+; CHECK-NEXT: sub r8, r8, r9
+; CHECK-NEXT: li r9, 0
+; CHECK-NEXT: addi r4, r4, 1
+; CHECK-NEXT: mtctr r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lfd f0, 0(r2)
-; CHECK-NEXT: lfd f1, 0(r31)
-; CHECK-NEXT: add r3, r10, r30
-; CHECK-NEXT: add r3, r3, r30
+; CHECK-NEXT: lfd f0, 0(r11)
+; CHECK-NEXT: lfd f1, 0(r2)
+; CHECK-NEXT: add r0, r0, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfd f1, 0(r14)
-; CHECK-NEXT: add r3, r3, r30
-; CHECK-NEXT: add r10, r3, r30
+; CHECK-NEXT: lfd f1, 0(r31)
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfd f0, 0(r14)
-; CHECK-NEXT: add r14, r14, r11
-; CHECK-NEXT: lfdx f0, r2, r0
-; CHECK-NEXT: lfdx f1, r31, r0
+; CHECK-NEXT: stfd f0, 0(r31)
+; CHECK-NEXT: add r31, r31, r12
+; CHECK-NEXT: lfdx f0, r11, r7
+; CHECK-NEXT: lfdx f1, r2, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r15, r7
+; CHECK-NEXT: lfdx f1, r14, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r15, r7
-; CHECK-NEXT: lfdx f0, r2, r12
-; CHECK-NEXT: lfdx f1, r31, r12
-; CHECK-NEXT: add r2, r2, r11
-; CHECK-NEXT: add r31, r31, r11
+; CHECK-NEXT: stfdx f0, r14, r9
+; CHECK-NEXT: lfdx f0, r11, r8
+; CHECK-NEXT: lfdx f1, r2, r8
+; CHECK-NEXT: add r11, r11, r12
+; CHECK-NEXT: add r2, r2, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r16, r7
+; CHECK-NEXT: lfdx f1, r15, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r16, r7
-; CHECK-NEXT: lfd f0, 0(r17)
-; CHECK-NEXT: lfd f1, 0(r18)
+; CHECK-NEXT: stfdx f0, r15, r9
+; CHECK-NEXT: lfd f0, 0(r18)
+; CHECK-NEXT: lfd f1, 0(r17)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r19, r7
+; CHECK-NEXT: lfdx f1, r16, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r19, r7
-; CHECK-NEXT: lfdx f0, r17, r0
-; CHECK-NEXT: lfdx f1, r18, r0
+; CHECK-NEXT: stfdx f0, r16, r9
+; CHECK-NEXT: lfdx f0, r18, r7
+; CHECK-NEXT: lfdx f1, r17, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r20, r7
+; CHECK-NEXT: lfdx f1, r19, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r20, r7
-; CHECK-NEXT: lfdx f0, r17, r12
-; CHECK-NEXT: lfdx f1, r18, r12
-; CHECK-NEXT: add r17, r17, r11
-; CHECK-NEXT: add r18, r18, r11
+; CHECK-NEXT: stfdx f0, r19, r9
+; CHECK-NEXT: lfdx f0, r18, r8
+; CHECK-NEXT: lfdx f1, r17, r8
+; CHECK-NEXT: add r18, r18, r12
+; CHECK-NEXT: add r17, r17, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r21, r7
+; CHECK-NEXT: lfdx f1, r20, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r21, r7
-; CHECK-NEXT: lfd f0, 0(r22)
-; CHECK-NEXT: lfd f1, 0(r23)
+; CHECK-NEXT: stfdx f0, r20, r9
+; CHECK-NEXT: lfd f0, 0(r23)
+; CHECK-NEXT: lfd f1, 0(r22)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r24, r7
+; CHECK-NEXT: lfdx f1, r21, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r24, r7
-; CHECK-NEXT: lfdx f0, r22, r0
-; CHECK-NEXT: lfdx f1, r23, r0
+; CHECK-NEXT: stfdx f0, r21, r9
+; CHECK-NEXT: lfdx f0, r23, r7
+; CHECK-NEXT: lfdx f1, r22, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r25, r7
+; CHECK-NEXT: lfdx f1, r24, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r25, r7
-; CHECK-NEXT: lfdx f0, r22, r12
-; CHECK-NEXT: lfdx f1, r23, r12
-; CHECK-NEXT: add r22, r22, r11
-; CHECK-NEXT: add r23, r23, r11
+; CHECK-NEXT: stfdx f0, r24, r9
+; CHECK-NEXT: lfdx f0, r23, r8
+; CHECK-NEXT: lfdx f1, r22, r8
+; CHECK-NEXT: add r23, r23, r12
+; CHECK-NEXT: add r22, r22, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r26, r7
+; CHECK-NEXT: lfdx f1, r25, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r26, r7
-; CHECK-NEXT: lfd f0, 0(r27)
-; CHECK-NEXT: lfd f1, 0(r28)
+; CHECK-NEXT: stfdx f0, r25, r9
+; CHECK-NEXT: lfd f0, 0(r28)
+; CHECK-NEXT: lfd f1, 0(r27)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r29, r7
+; CHECK-NEXT: lfdx f1, r26, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r29, r7
-; CHECK-NEXT: lfdx f0, r27, r0
-; CHECK-NEXT: lfdx f1, r28, r0
+; CHECK-NEXT: stfdx f0, r26, r9
+; CHECK-NEXT: lfdx f0, r28, r7
+; CHECK-NEXT: lfdx f1, r27, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r8, r7
+; CHECK-NEXT: lfdx f1, r29, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r8, r7
-; CHECK-NEXT: lfdx f0, r27, r12
-; CHECK-NEXT: lfdx f1, r28, r12
-; CHECK-NEXT: add r27, r27, r11
-; CHECK-NEXT: add r28, r28, r11
+; CHECK-NEXT: stfdx f0, r29, r9
+; CHECK-NEXT: lfdx f0, r28, r8
+; CHECK-NEXT: lfdx f1, r27, r8
+; CHECK-NEXT: add r28, r28, r12
+; CHECK-NEXT: add r27, r27, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r9, r7
+; CHECK-NEXT: lfdx f1, r30, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r9, r7
-; CHECK-NEXT: add r7, r7, r11
+; CHECK-NEXT: stfdx f0, r30, r9
+; CHECK-NEXT: add r9, r9, r12
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT: ld r7, -192(r1) # 8-byte Folded Reload
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
-; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT: sldi r8, r30, 3
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r7, r5, r3
-; CHECK-NEXT: add r9, r4, r3
-; CHECK-NEXT: add r11, r0, r3
-; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r12, r5, r3
-; CHECK-NEXT: add r30, r4, r3
-; CHECK-NEXT: add r29, r0, r3
-; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: li r10, 0
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r5, r5, r3
-; CHECK-NEXT: add r4, r4, r3
-; CHECK-NEXT: add r3, r0, r3
+; CHECK-NEXT: ld r4, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r29, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: mr r30, r3
+; CHECK-NEXT: sldi r7, r7, 3
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r4, r4, 3
+; CHECK-NEXT: add r3, r5, r4
+; CHECK-NEXT: add r8, r29, r4
+; CHECK-NEXT: add r9, r30, r4
+; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r4, r4, 3
+; CHECK-NEXT: add r10, r5, r4
+; CHECK-NEXT: add r11, r29, r4
+; CHECK-NEXT: add r12, r30, r4
+; CHECK-NEXT: ld r4, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r0, r4, 3
+; CHECK-NEXT: add r5, r5, r0
+; CHECK-NEXT: add r4, r29, r0
+; CHECK-NEXT: add r30, r30, r0
+; CHECK-NEXT: li r0, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
-; CHECK-NEXT: lfdx f0, r3, r10
-; CHECK-NEXT: lfdx f1, r4, r10
+; CHECK-NEXT: lfdx f0, r30, r0
+; CHECK-NEXT: lfdx f1, r4, r0
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
-; CHECK-NEXT: add r5, r5, r8
-; CHECK-NEXT: lfdx f0, r29, r10
-; CHECK-NEXT: lfdx f1, r30, r10
+; CHECK-NEXT: add r5, r5, r7
+; CHECK-NEXT: lfdx f0, r12, r0
+; CHECK-NEXT: lfdx f1, r11, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r12, r10
+; CHECK-NEXT: lfdx f1, r10, r0
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r12, r10
-; CHECK-NEXT: lfdx f0, r11, r10
-; CHECK-NEXT: lfdx f1, r9, r10
+; CHECK-NEXT: stfdx f0, r10, r0
+; CHECK-NEXT: lfdx f0, r9, r0
+; CHECK-NEXT: lfdx f1, r8, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r7, r10
+; CHECK-NEXT: lfdx f1, r3, r0
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r7, r10
-; CHECK-NEXT: add r10, r10, r8
+; CHECK-NEXT: stfdx f0, r3, r0
+; CHECK-NEXT: add r0, r0, r7
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3611d92..e305a74 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -90,7 +90,7 @@
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: RISC-V DAG->DAG Pattern Instruction Selection
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT: RISC-V Fold Masks
+; CHECK-NEXT: RISC-V Vector Peephole Optimization
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Early Tail Duplication
; CHECK-NEXT: Optimize machine instruction PHIs
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 3a99f53..cb50ca4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -97,3 +97,112 @@ define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a,
store <2 x double> %g, ptr %z
ret void
}
+
+define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2, <2 x double> %w) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v12, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v12, v8
+; NO_FOLDING-NEXT: vfmadd.vv v12, v9, v11
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v12, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwmul.vv v12, v8, v9
+; FOLDING-NEXT: vfwmacc.vv v11, v8, v10
+; FOLDING-NEXT: vfwsub.vv v8, v9, v10
+; FOLDING-NEXT: vse64.v v12, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %w)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_multiple_users_addend_user(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v11, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmul.vv v10, v11, v8
+; FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; FOLDING-NEXT: vfsub.vv v8, v8, v9
+; FOLDING-NEXT: vse64.v v10, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %d)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_addend_user(ptr %x, <2 x float> %a, <2 x float> %b) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; NO_FOLDING-NEXT: vse64.v v8, (a0)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; FOLDING-NEXT: vse64.v v8, (a0)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %d)
+ store <2 x double> %f, ptr %x
+ ret void
+}
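
The tests above exercise combining a multiply or fma fed by fpext values into RISC-V widening vector ops (vfwmul/vfwmacc/vfwsub). A minimal sketch of the positive pattern, assuming the unsuffixed @llvm.fma spelling used in the tests (illustrative only, not part of the patch):

  %c = fpext <2 x float> %a to <2 x double>
  %d = fpext <2 x float> %b to <2 x double>
  %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %acc)
  ; => vfwmacc.vv v_acc, v_a, v_b   ; vd[i] += vs1[i] * vs2[i], sources at SEW=32, accumulator at SEW=64

The fold wants the addend to be an ordinary wide accumulator; when the addend is itself one of the fpext results (the *_addend_user cases above), the combine leaves the code unfolded, which is what the FOLDING check lines verify.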
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index 1803b52..5140d89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -2031,11 +2031,8 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2048,11 +2045,8 @@ define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2066,11 +2060,8 @@ define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2085,11 +2076,8 @@ define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
diff --git a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
index 6f0abd6..16231b2 100644
--- a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
@@ -153,3 +153,13 @@ define void @f7(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
store i8 %trunc3, ptr %ptr4
ret void
}
+
+; Test that a truncating store with a non-simple VT can be handled.
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+ %1 = load <12 x i32>, ptr %src, align 64
+ %2 = extractelement <12 x i32> %1, i64 11
+ %3 = trunc i32 %2 to i16
+ store i16 %3, ptr %dst, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f..f7b2538 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -115,7 +115,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
; X64-NEXT: andq $-1024, %rdi # imm = 0xFC00
; X64-NEXT: andq $-1024, %rsi # imm = 0xFC00
; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcl $0, %edx
; X64-NEXT: shldq $54, %rsi, %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 8110dd4..5da6832 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -737,6 +737,29 @@ entry:
ret void
}
+define void @freeze_vector_insert(<2 x float> %vec, i32 %idx, float %scalar) sanitize_numerical_stability {
+; CHECK-LABEL: @freeze_vector_insert(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @freeze_vector_insert to i64)
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr @__nsan_shadow_args_ptr, align 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[VEC:%.*]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr getelementptr ([16384 x i8], ptr @__nsan_shadow_args_ptr, i64 0, i64 16), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[SCALAR:%.*]] to double
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT: store i64 0, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[VEC]], float [[SCALAR]], i32 [[IDX:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP7]], i32 [[IDX]]
+; CHECK-NEXT: [[FROZEN:%.*]] = freeze <2 x float> [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = freeze <2 x double> [[TMP9]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %1 = insertelement <2 x float> %vec, float %scalar, i32 %idx
+ %frozen = freeze <2 x float> %1
+ ret void
+}
define void @vector_shuffle(<2 x float> %0) sanitize_numerical_stability {
; CHECK-LABEL: @vector_shuffle(
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
index 958d5ca..08a131d 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
@@ -99,11 +99,13 @@ jirl $a0, $a0, 0x20000
# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %b16) or an integer in the range [-131072, 131068]
## simm20
-pcaddi $a0, -0x80001
-# CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-524288, 524287]
pcaddu12i $a0, 0x80000
# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
+## simm20_pcaddi
+pcaddi $a0, -0x80001
+# CHECK: :[[#@LINE-1]]:13: error: operand must be a symbol with modifier (e.g. %pcrel_20) or an integer in the range [-524288, 524287]
+
## simm20_lu12iw
lu12i.w $a0, -0x80001
# CHECK: :[[#@LINE-1]]:14: error: operand must be a symbol with modifier (e.g. %abs_hi20) or an integer in the range [-524288, 524287]
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index e83b671..091dce2 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -288,3 +288,23 @@ addi.d $t1, $a2, %le_lo12_r(foo)
# RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0
# INSTR: addi.d $t1, $a2, %le_lo12_r(foo)
# FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE
+
+pcaddi $t1, %pcrel_20(foo)
+# RELOC: R_LARCH_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %ld_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_LD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %ld_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %ld_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %gd_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_GD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %gd_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %gd_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %desc_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_DESC_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %desc_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pcrel_20(foo), kind: FK_NONE
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 0000000..8bda74e
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Fold ADD and remove old op if unused.
+define void @add_one_use(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_one_use(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ %index.next = add i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD and copy NUW if both ops have it.
+; https://alive2.llvm.org/ce/z/bPAT7Z
+define void @add_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add nuw i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nuw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NUW if only one op has it.
+define void @add_no_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nuw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NSW if one op has it.
+define void @add_no_nsw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nsw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NSW even if both ops have it.
+define void @add_no_nsw_2(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add nsw i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nsw i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold if the ops are different (even if they are both associative).
+define void @diff_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @diff_ops(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = mul i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = mul i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold if the ops are not associative.
+define void @noassoc_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @noassoc_ops(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = sub i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = sub i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = sub i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = sub i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold floating-point ops, even if they are associative. This would be
+; valid, but is currently disabled.
+define void @fadd(float %c1, float %c2) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = fadd fast float [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(float [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = fadd fast float [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi float [ 0., %entry ], [ %index.next, %loop ]
+ %step.add = fadd fast float %index, %c1
+ call void @use(float %step.add)
+ %index.next = fadd fast float %step.add, %c2
+ br label %loop
+}
+
+; Original reproducer, adapted from:
+; for(long i = 0; i < n; ++i)
+; a[i] = (i*k) * v;
+define void @test(i64 %n, i64 %k) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
+; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT: [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ %k.2 = shl nuw nsw i64 %k, 1
+ %vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
+ %.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
+ %.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
+ br label %loop
+
+loop:
+ %vec.ind = phi <2 x i64> [ %vec.init, %entry ], [ %vec.ind.next, %loop ]
+ %step.add = add <2 x i64> %vec.ind, %.splat
+ call void @use(<2 x i64> %step.add)
+ %vec.ind.next = add <2 x i64> %step.add, %.splat
+ br label %loop
+}
+
+declare void @use()
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5..36e2eab 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -77,9 +77,10 @@ return:
define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i32 1, 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond:
-; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD_REASS:%.*]], [[IF_END:%.*]] ]
; CHECK-NEXT: [[P_ADDR_0:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[IF_END]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_ADDR_0]], [[J:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[LOOPEXIT0:%.*]]
@@ -97,7 +98,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT: [[ADD_REASS]] = add i32 [[I_ADDR]], [[INVARIANT_OP]]
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842..9c9547c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -135,8 +135,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<%0> = VF * UF
@@ -340,8 +340,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<%0> = VF * UF
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea523..31f0e06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,19 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
; CHECK-NEXT: br label [[WHILE:%.*]]
; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
+; CHECK-NEXT: [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
; CHECK-NEXT: br label [[WHILE]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 8f1d7a1..69ecf18 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
+; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
-; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
+; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll b/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll
new file mode 100644
index 0000000..b1262e2
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn -S -passes=simplifycfg < %s | FileCheck %s
+
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
+
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: define i32 @test_01a(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[MERGE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[IF_FALSE_2:.*]]
+; CHECK: [[IF_FALSE_2]]:
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
index 6ba51e0..d148063 100644
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -4,6 +4,9 @@
; RUN: opt -S -passes='simplifycfg<hoist-common-insts;sink-common-insts>' < %s | FileCheck -check-prefixes=CHECK,SINK %s
declare void @foo() convergent
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
declare i32 @tid()
declare i32 @mbcnt(i32 %a, i32 %b) convergent
declare i32 @bpermute(i32 %a, i32 %b) convergent
@@ -45,6 +48,42 @@ exit:
ret i32 %a
}
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: @test_01a(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_CRITEDGE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: exit.critedge:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
+
define void @test_02(ptr %y.coerce) convergent {
; NOSINK-LABEL: @test_02(
; NOSINK-NEXT: entry:
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
index 71902fe..54ff013 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
@@ -49,13 +49,13 @@ pxor %xmm2, %xmm2
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 35
-# CHECK-NEXT: Total Cycles: 39
+# CHECK-NEXT: Total Cycles: 37
# CHECK-NEXT: Total uOps: 35
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 0.90
-# CHECK-NEXT: IPC: 0.90
-# CHECK-NEXT: Block RThroughput: 11.0
+# CHECK-NEXT: uOps Per Cycle: 0.95
+# CHECK-NEXT: IPC: 0.95
+# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@@ -79,7 +79,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -118,7 +118,7 @@ pxor %xmm2, %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - - 2.00 12.00 - 6.00 - -
+# CHECK-NEXT: - - 3.00 11.00 - 6.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -135,7 +135,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: - - - 1.00 - - - - psubb %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubd %mm2, %mm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm2, %mm2
+# CHECK-NEXT: - - - - - 1.00 - - psubq %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubw %mm2, %mm2
# CHECK-NEXT: - - - - - - - - psubb %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - psubd %xmm2, %xmm2
@@ -155,48 +155,48 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - pandn %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - xorps %xmm0, %xmm0
# CHECK-NEXT: - - - - - - - - xorpd %xmm1, %xmm1
-# CHECK-NEXT: - - - - - 1.00 - - pxor %mm2, %mm2
+# CHECK-NEXT: - - 1.00 - - - - - pxor %mm2, %mm2
# CHECK-NEXT: - - - - - - - - pxor %xmm2, %xmm2
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: 0123456789 0123456
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DR . . . . . . . . subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . . . . . subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . . . . . xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . . . . . xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . . . . . pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . . . . . pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . . . . . pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . . . . . pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . . . . . pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . . . . . pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . . . . . pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D========eeeER . . . . . psubb %mm2, %mm2
-# CHECK-NEXT: [0,12] . D==========eeeER . . . . . psubd %mm2, %mm2
-# CHECK-NEXT: [0,13] . D=============eeeER . . . . psubq %mm2, %mm2
-# CHECK-NEXT: [0,14] . D================eeeER. . . . psubw %mm2, %mm2
-# CHECK-NEXT: [0,15] . D--------------------R. . . . psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,16] . D-------------------R. . . . psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,17] . D-------------------R. . . . psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,18] . D-------------------R. . . . psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,19] . D==================eeeER . . . psubsb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D====================eeeER . . psubsw %mm2, %mm2
-# CHECK-NEXT: [0,21] . DeE----------------------R . . psubsb %xmm2, %xmm2
-# CHECK-NEXT: [0,22] . D=eE---------------------R . . psubsw %xmm2, %xmm2
-# CHECK-NEXT: [0,23] . D=======================eeeER . . psubusb %mm2, %mm2
-# CHECK-NEXT: [0,24] . .D=========================eeeER . psubusw %mm2, %mm2
-# CHECK-NEXT: [0,25] . .D=eE--------------------------R . psubusb %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D==eE-------------------------R . psubusw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D==eE-------------------------R . andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,28] . . D==eE------------------------R . andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,29] . . D===========================eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,30] . . D==eE-------------------------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,31] . . D-----------------------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,32] . . D----------------------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,33] . . D===========================eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,34] . . D-----------------------------R pxor %xmm2, %xmm2
+# CHECK: [0,0] DR . . . . . . .. subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . . .. subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . . .. xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . . .. xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . . .. pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . . .. pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . . .. pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . . .. pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . . .. pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . . .. pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . . .. pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D========eeeER . . . .. psubb %mm2, %mm2
+# CHECK-NEXT: [0,12] . D==========eeeER . . . .. psubd %mm2, %mm2
+# CHECK-NEXT: [0,13] . D=============eER. . . .. psubq %mm2, %mm2
+# CHECK-NEXT: [0,14] . D==============eeeER . . .. psubw %mm2, %mm2
+# CHECK-NEXT: [0,15] . D------------------R . . .. psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,16] . D-----------------R . . .. psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,17] . D-----------------R . . .. psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,18] . D-----------------R . . .. psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,19] . D================eeeER . .. psubsb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D==================eeeER . .. psubsw %mm2, %mm2
+# CHECK-NEXT: [0,21] . DeE--------------------R . .. psubsb %xmm2, %xmm2
+# CHECK-NEXT: [0,22] . D=eE-------------------R . .. psubsw %xmm2, %xmm2
+# CHECK-NEXT: [0,23] . D=====================eeeER .. psubusb %mm2, %mm2
+# CHECK-NEXT: [0,24] . .D=======================eeeER.. psubusw %mm2, %mm2
+# CHECK-NEXT: [0,25] . .D=eE------------------------R.. psubusb %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D==eE-----------------------R.. psubusw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D==eE-----------------------R.. andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,28] . . D==eE----------------------R.. andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,29] . . D=========================eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,30] . . D==eE-----------------------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,31] . . D---------------------------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,32] . . D--------------------------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,33] . . D=========================eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,34] . . D---------------------------R pxor %xmm2, %xmm2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -219,25 +219,25 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 11. 1 9.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 12. 1 11.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 13. 1 14.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 14. 1 17.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 15. 1 0.0 0.0 20.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 16. 1 0.0 0.0 19.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 17. 1 0.0 0.0 19.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 18. 1 0.0 0.0 19.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 19. 1 19.0 0.0 0.0 psubsb %mm2, %mm2
-# CHECK-NEXT: 20. 1 21.0 0.0 0.0 psubsw %mm2, %mm2
-# CHECK-NEXT: 21. 1 1.0 1.0 22.0 psubsb %xmm2, %xmm2
-# CHECK-NEXT: 22. 1 2.0 0.0 21.0 psubsw %xmm2, %xmm2
-# CHECK-NEXT: 23. 1 24.0 0.0 0.0 psubusb %mm2, %mm2
-# CHECK-NEXT: 24. 1 26.0 0.0 0.0 psubusw %mm2, %mm2
-# CHECK-NEXT: 25. 1 2.0 0.0 26.0 psubusb %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 3.0 0.0 25.0 psubusw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 3.0 3.0 25.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 28. 1 3.0 3.0 24.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 29. 1 28.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 30. 1 3.0 0.0 25.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 31. 1 0.0 0.0 29.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 32. 1 0.0 0.0 28.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 33. 1 28.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 34. 1 0.0 0.0 29.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 1 6.5 0.2 10.5 <total>
+# CHECK-NEXT: 14. 1 15.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 15. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 16. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 17. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 18. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 19. 1 17.0 0.0 0.0 psubsb %mm2, %mm2
+# CHECK-NEXT: 20. 1 19.0 0.0 0.0 psubsw %mm2, %mm2
+# CHECK-NEXT: 21. 1 1.0 1.0 20.0 psubsb %xmm2, %xmm2
+# CHECK-NEXT: 22. 1 2.0 0.0 19.0 psubsw %xmm2, %xmm2
+# CHECK-NEXT: 23. 1 22.0 0.0 0.0 psubusb %mm2, %mm2
+# CHECK-NEXT: 24. 1 24.0 0.0 0.0 psubusw %mm2, %mm2
+# CHECK-NEXT: 25. 1 2.0 0.0 24.0 psubusb %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 3.0 0.0 23.0 psubusw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 3.0 3.0 23.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 28. 1 3.0 3.0 22.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 29. 1 26.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 30. 1 3.0 0.0 23.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 31. 1 0.0 0.0 27.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 32. 1 0.0 0.0 26.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 33. 1 26.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 34. 1 0.0 0.0 27.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 1 6.1 0.2 9.7 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
index 69491f0..53b9d22 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
index 5094dd1..01f516a 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
index c3b8b73..e2cfd02 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
index 4a034cc..bdca772 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
@@ -83,12 +83,12 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 63
-# CHECK-NEXT: Total Cycles: 27
+# CHECK-NEXT: Total Cycles: 25
# CHECK-NEXT: Total uOps: 63
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 2.33
-# CHECK-NEXT: IPC: 2.33
+# CHECK-NEXT: uOps Per Cycle: 2.52
+# CHECK-NEXT: IPC: 2.52
# CHECK-NEXT: Block RThroughput: 15.8
# CHECK: Instruction Info:
@@ -121,7 +121,7 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 0 0.25 vpcmpgtw %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -250,71 +250,71 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DR . . . . .. subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . .. subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . .. xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . .. xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . .. pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . .. pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . .. pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . .. pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . .. pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . .. pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . .. pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D---------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,12] . D--------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,13] . D--------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,14] . D--------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,15] . D--------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,16] . D-------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,17] . D-------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,18] . D-------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,19] . D======eeeER . .. psubb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D========eeeER . .. psubd %mm2, %mm2
-# CHECK-NEXT: [0,21] . D===========eeeER .. psubq %mm2, %mm2
-# CHECK-NEXT: [0,22] . D==============eeeER.. psubw %mm2, %mm2
-# CHECK-NEXT: [0,23] . D------------------R.. psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,24] . .D-----------------R.. psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,25] . .D-----------------R.. psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D-----------------R.. psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D-----------------R.. vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,28] . . D----------------R.. vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,29] . . D----------------R.. vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,30] . . D----------------R.. vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,31] . . D----------------R.. vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,32] . . D---------------R.. vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,33] . . D---------------R.. vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,34] . . D---------------R.. vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,35] . . DeE-------------R.. andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,36] . . DeE------------R.. andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,37] . . D=eE-----------R.. vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,38] . . D===eE---------R.. vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,39] . . D==eE----------R.. vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,40] . . D===eE--------R.. vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,41] . . D============eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,42] . . D==eE----------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,43] . . DeE------------R. vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,44] . . .D===eE--------R. vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: [0,45] . . .D====eE-------R. vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: [0,46] . . .DeE-----------R. vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,47] . . .D=====eE------R. vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: [0,48] . . . D=====eE-----R. vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: [0,49] . . . D------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,50] . . . D------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,51] . . . D------------R. vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,52] . . . D-----------R. vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,53] . . . D-----------R. vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,54] . . . D-----------R. vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,55] . . . D==========eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,56] . . . D-----------R pxor %xmm2, %xmm2
-# CHECK-NEXT: [0,57] . . . D-----------R vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,58] . . . D-----------R vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: [0,59] . . . D-----------R vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: [0,60] . . . D----------R vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: [0,61] . . . D----------R vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: [0,62] . . . D----------R vpxor %xmm3, %xmm3, %xmm5
+# CHECK: [0,0] DR . . . . . subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D---------R . . . vpcmpgtb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12] . D--------R . . . vpcmpgtd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13] . D--------R . . . vpcmpgtq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14] . D--------R . . . vpcmpgtw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15] . D--------R . . . vpcmpgtb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16] . D-------R . . . vpcmpgtd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17] . D-------R . . . vpcmpgtq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18] . D-------R . . . vpcmpgtw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19] . D======eeeER . . psubb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D========eeeER . . psubd %mm2, %mm2
+# CHECK-NEXT: [0,21] . D===========eER. . psubq %mm2, %mm2
+# CHECK-NEXT: [0,22] . D============eeeER . psubw %mm2, %mm2
+# CHECK-NEXT: [0,23] . D----------------R . psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,24] . .D---------------R . psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,25] . .D---------------R . psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D---------------R . psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D---------------R . vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28] . . D--------------R . vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29] . . D--------------R . vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30] . . D--------------R . vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31] . . D--------------R . vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32] . . D-------------R . vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33] . . D-------------R . vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34] . . D-------------R . vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35] . . DeE-----------R . andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,36] . . DeE----------R . andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,37] . . D=eE---------R . vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,38] . . D===eE-------R . vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,39] . . D==eE--------R . vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,40] . . D===eE------R . vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,41] . . D==========eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,42] . . D==eE--------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,43] . . DeE----------R. vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,44] . . .D===eE------R. vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,45] . . .D====eE-----R. vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,46] . . .DeE---------R. vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,47] . . .D=====eE----R. vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: [0,48] . . . D=====eE---R. vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: [0,49] . . . D----------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,50] . . . D----------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,51] . . . D----------R. vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,52] . . . D---------R. vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,53] . . . D---------R. vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,54] . . . D---------R. vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,55] . . . D========eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,56] . . . D---------R pxor %xmm2, %xmm2
+# CHECK-NEXT: [0,57] . . . D---------R vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58] . . . D---------R vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,59] . . . D---------R vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,60] . . . D--------R vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: [0,61] . . . D--------R vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: [0,62] . . . D--------R vpxor %xmm3, %xmm3, %xmm5
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -345,45 +345,45 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 19. 1 7.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 20. 1 9.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 21. 1 12.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 22. 1 15.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 23. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 24. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 25. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 0.0 0.0 17.0 vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 28. 1 0.0 0.0 16.0 vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 29. 1 0.0 0.0 16.0 vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 30. 1 0.0 0.0 16.0 vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 31. 1 0.0 0.0 16.0 vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 32. 1 0.0 0.0 15.0 vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 33. 1 0.0 0.0 15.0 vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 34. 1 0.0 0.0 15.0 vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 35. 1 1.0 1.0 13.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 36. 1 1.0 1.0 12.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 37. 1 2.0 2.0 11.0 vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 38. 1 4.0 2.0 9.0 vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 39. 1 3.0 0.0 10.0 vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 40. 1 4.0 0.0 8.0 vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 41. 1 13.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 42. 1 3.0 0.0 10.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 43. 1 1.0 1.0 12.0 vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 44. 1 4.0 1.0 8.0 vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: 45. 1 5.0 1.0 7.0 vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: 46. 1 1.0 0.0 11.0 vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 47. 1 6.0 3.0 6.0 vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: 48. 1 6.0 3.0 5.0 vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: 49. 1 0.0 0.0 12.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 50. 1 0.0 0.0 12.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 51. 1 0.0 0.0 12.0 vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 52. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 53. 1 0.0 0.0 11.0 vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 54. 1 0.0 0.0 11.0 vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 55. 1 11.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 56. 1 0.0 0.0 11.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 57. 1 0.0 0.0 11.0 vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 58. 1 0.0 0.0 11.0 vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: 59. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: 60. 1 0.0 0.0 10.0 vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: 61. 1 0.0 0.0 10.0 vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: 62. 1 0.0 0.0 10.0 vpxor %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 1 1.9 0.3 8.9 <total>
+# CHECK-NEXT: 22. 1 13.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 23. 1 0.0 0.0 16.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 24. 1 0.0 0.0 15.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 25. 1 0.0 0.0 15.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 0.0 0.0 15.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 0.0 0.0 15.0 vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28. 1 0.0 0.0 14.0 vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29. 1 0.0 0.0 14.0 vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30. 1 0.0 0.0 14.0 vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31. 1 0.0 0.0 14.0 vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32. 1 0.0 0.0 13.0 vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33. 1 0.0 0.0 13.0 vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34. 1 0.0 0.0 13.0 vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35. 1 1.0 1.0 11.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 36. 1 1.0 1.0 10.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 37. 1 2.0 2.0 9.0 vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 38. 1 4.0 2.0 7.0 vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 39. 1 3.0 0.0 8.0 vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 40. 1 4.0 0.0 6.0 vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 41. 1 11.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 42. 1 3.0 0.0 8.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 43. 1 1.0 1.0 10.0 vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 44. 1 4.0 1.0 6.0 vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 45. 1 5.0 1.0 5.0 vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 46. 1 1.0 0.0 9.0 vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 47. 1 6.0 3.0 4.0 vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: 48. 1 6.0 3.0 3.0 vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: 49. 1 0.0 0.0 10.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 50. 1 0.0 0.0 10.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 51. 1 0.0 0.0 10.0 vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 52. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 53. 1 0.0 0.0 9.0 vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 54. 1 0.0 0.0 9.0 vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 55. 1 9.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 56. 1 0.0 0.0 9.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 57. 1 0.0 0.0 9.0 vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58. 1 0.0 0.0 9.0 vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 59. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 60. 1 0.0 0.0 8.0 vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: 61. 1 0.0 0.0 8.0 vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: 62. 1 0.0 0.0 8.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 1.8 0.3 7.7 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index ef5a9e3..2711bc4 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1461,8 +1461,8 @@ vzeroupper
# CHECK-NEXT: 4 9 2.00 * vphaddsw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 3 2.00 vphaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 9 2.00 * vphaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 4 0.50 vphminposuw %xmm0, %xmm2
-# CHECK-NEXT: 2 10 0.50 * vphminposuw (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 vphminposuw %xmm0, %xmm2
+# CHECK-NEXT: 2 10 1.00 * vphminposuw (%rax), %xmm2
# CHECK-NEXT: 3 3 2.00 vphsubd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 9 2.00 * vphsubd (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 3 2.00 vphsubsw %xmm0, %xmm1, %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 339.58 199.58 173.83 173.83 38.00 326.58 6.25 11.33
+# CHECK-NEXT: - 126.00 340.58 198.58 173.83 173.83 38.00 326.58 6.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -2171,8 +2171,8 @@ vzeroupper
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vphaddsw (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vphaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vphaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 - - - - - - vphminposuw %xmm0, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vphminposuw (%rax), %xmm2
+# CHECK-NEXT: - - 1.00 - - - - - - - vphminposuw %xmm0, %xmm2
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vphminposuw (%rax), %xmm2
# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vphsubd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vphsubd (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vphsubsw %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 1d8d67f..0eec7e4 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -189,8 +189,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 pextrq $1, %xmm0, %rcx
# CHECK-NEXT: 3 2 1.00 * pextrq $1, %xmm0, (%rax)
# CHECK-NEXT: 3 2 1.00 * pextrw $1, %xmm0, (%rax)
-# CHECK-NEXT: 1 4 0.50 phminposuw %xmm0, %xmm2
-# CHECK-NEXT: 2 10 0.50 * phminposuw (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 phminposuw %xmm0, %xmm2
+# CHECK-NEXT: 2 10 1.00 * phminposuw (%rax), %xmm2
# CHECK-NEXT: 2 2 2.00 pinsrb $1, %eax, %xmm1
# CHECK-NEXT: 2 6 1.00 * pinsrb $1, (%rax), %xmm1
# CHECK-NEXT: 2 2 2.00 pinsrd $1, %eax, %xmm1
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 37.83 31.33 23.67 23.67 5.00 63.33 0.50 1.67
+# CHECK-NEXT: - - 38.83 30.33 23.67 23.67 5.00 63.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -304,8 +304,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - pextrq $1, %xmm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 pextrq $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 pextrw $1, %xmm0, (%rax)
-# CHECK-NEXT: - - 0.50 0.50 - - - - - - phminposuw %xmm0, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - phminposuw (%rax), %xmm2
+# CHECK-NEXT: - - 1.00 - - - - - - - phminposuw %xmm0, %xmm2
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - phminposuw (%rax), %xmm2
# CHECK-NEXT: - - - - - - - 2.00 - - pinsrb $1, %eax, %xmm1
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - pinsrb $1, (%rax), %xmm1
# CHECK-NEXT: - - - - - - - 2.00 - - pinsrd $1, %eax, %xmm1
diff --git a/llvm/tools/llvm-driver/llvm-driver.cpp b/llvm/tools/llvm-driver/llvm-driver.cpp
index 53a8b93..14ce162 100644
--- a/llvm/tools/llvm-driver/llvm-driver.cpp
+++ b/llvm/tools/llvm-driver/llvm-driver.cpp
@@ -33,7 +33,7 @@ static void printHelpMessage() {
<< subcommands
<< "\n Type \"llvm <subcommand> --help\" to get more help on a "
"specific subcommand\n\n"
- << "OPTIONS:\n\n --help - Display this message";
+ << "OPTIONS:\n\n --help - Display this message\n";
}
static int findTool(int Argc, char **Argv, const char *Argv0) {
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index 629996d..3d36d80 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -232,7 +232,7 @@ static std::unique_ptr<MachineFunction> cloneMF(MachineFunction *SrcMF,
MachineModuleInfo &DestMMI) {
auto DstMF = std::make_unique<MachineFunction>(
SrcMF->getFunction(), SrcMF->getTarget(), SrcMF->getSubtarget(),
- SrcMF->getFunctionNumber(), DestMMI);
+ SrcMF->getContext(), SrcMF->getFunctionNumber());
DenseMap<MachineBasicBlock *, MachineBasicBlock *> Src2DstMBB;
auto *SrcMRI = &SrcMF->getRegInfo();
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 877793a..79e27c7 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -61,8 +61,8 @@ protected:
MachineModuleInfo MMI(TM.get());
- MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F), 0,
- MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 28cfb30..d5365d9 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -94,7 +94,7 @@ public:
const TargetSubtargetInfo &STI = *Machine->getSubtargetImpl(*F);
MF = std::make_unique<MachineFunction>(*F, (LLVMTargetMachine &)*Machine,
- STI, FunctionNum, *MMI);
+ STI, MMI->getContext(), FunctionNum);
// Create metadata: CU, subprogram, some blocks and an inline function
// scope.
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 1997e80..d464a16 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -135,6 +135,7 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
MachineModuleInfo MMI(TM);
const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(*F);
- return std::make_unique<MachineFunction>(*F, *TM, STI, FunctionNum, MMI);
+ return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
+ FunctionNum);
}
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index ef30bd4..c89e5a4 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -72,7 +72,7 @@ protected:
MachineModuleInfo MMI(TM.get());
MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- 0, MMI);
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index e318f46..549931b 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -71,7 +71,7 @@ protected:
MachineModuleInfo MMI(TM.get());
MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- 0, MMI);
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
@@ -131,6 +131,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
SDValue ICMP_EQ01 = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETEQ);
SDValue ICMP_EQ10 = DAG->getSetCC(DL, MVT::i1, Op1, Op0, ISD::SETEQ);
+ auto Int1VT = EVT::getIntegerVT(Context, 1);
+ SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Int1VT);
+ SDValue T = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 4, Int1VT);
+ SDValue F = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 5, Int1VT);
+ SDValue Select = DAG->getSelect(DL, MVT::i1, Cond, T, F);
+
using namespace SDPatternMatch;
ISD::CondCode CC;
EXPECT_TRUE(sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(),
@@ -153,6 +159,13 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
m_SpecificCondCode(ISD::SETEQ))));
EXPECT_TRUE(sd_match(ICMP_EQ10, m_c_SetCC(m_Specific(Op0), m_Specific(Op1),
m_SpecificCondCode(ISD::SETEQ))));
+
+ EXPECT_TRUE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(T), m_Specific(F))));
+ EXPECT_FALSE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(F), m_Specific(T))));
+ EXPECT_FALSE(sd_match(ICMP_EQ01, m_Select(m_Specific(Op0), m_Specific(Op1),
+ m_SpecificCondCode(ISD::SETEQ))));
}
TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index ece288a..5ac4eda 100644
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -29,7 +29,8 @@ TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) {
auto *F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &Mod);
MachineModuleInfo MMI(TM.get());
- auto MF = std::make_unique<MachineFunction>(*F, *TM, ST, 42, MMI);
+ auto MF =
+ std::make_unique<MachineFunction>(*F, *TM, ST, MMI.getContext(), 42);
auto *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
index 7d4f383..86aa475 100644
--- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
+++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
@@ -62,7 +62,7 @@ protected:
TM->getTargetCPU(),
TM->getTargetFeatureString(), *TM);
- MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 1);
}
};
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 62cfda4..fe71161 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -63,7 +63,7 @@ protected:
TM->getTargetFeatureString(),
TM->getTargetTriple().isArch64Bit() ? "lp64" : "ilp32", 0, 0, *TM);
- MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 42, *MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 42);
}
};
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index d9d6789..9cf9060 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
&VecOp, false);
VPValue EVL;
- VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
@@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
&VecOp, false);
VPValue EVL;
- VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
VPRecipeBase *BaseR = &EVLRecipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
index c6c6ef9..7825396 100644
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
@@ -15,7 +15,6 @@ static_library("Interpreter") {
deps = [
":InterpreterProperties",
":InterpreterPropertiesEnum",
- "Interfaces",
"//lldb/source/Commands",
"//lldb/source/Core",
"//lldb/source/DataFormatters",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
deleted file mode 100644
index 2e70c54..0000000
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
+++ /dev/null
@@ -1,9 +0,0 @@
-static_library("Interfaces") {
- output_name = "lldbInterpreterInterfaces"
- configs += [ "//llvm/utils/gn/build:lldb_code" ]
- deps = [
- "//lldb/source/Utility",
- "//llvm/lib/Support",
- ]
- sources = [ "ScriptedInterfaceUsages.cpp" ]
-}
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 945fb0e..a2b0603 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -266,6 +266,7 @@ group("test") {
"//llvm/tools/llvm-cat",
"//llvm/tools/llvm-cfi-verify",
"//llvm/tools/llvm-cov",
+ "//llvm/tools/llvm-ctxprof-util",
"//llvm/tools/llvm-cvtres",
"//llvm/tools/llvm-cxxdump",
"//llvm/tools/llvm-cxxfilt",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
new file mode 100644
index 0000000..fd921f6
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
@@ -0,0 +1,11 @@
+import("//llvm/utils/gn/build/driver_executable.gni")
+
+driver_executable("llvm-ctxprof-util") {
+ deps = [
+ "//llvm/lib/IR",
+ "//llvm/lib/Object",
+ "//llvm/lib/ProfileData",
+ "//llvm/lib/Support",
+ ]
+ sources = [ "llvm-ctxprof-util.cpp" ]
+}
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 01fadef..c252d9c 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -101,6 +101,9 @@ their semantics via a special [TableGen backend][TableGenBackend]:
* The `AttrConstraint` class hierarchy: They are used to specify the
constraints over attributes. A notable subclass hierarchy is `Attr`, which
stands for constraints for attributes whose values are of common types.
+* The `Property` class hierarchy: They are used to specify non-attribute-backed
+ properties that are inherent to operations. This will be expanded to a
+ `PropertyConstraint` class or something similar in the future.
An operation is defined by specializing the `Op` class with concrete contents
for all the fields it requires. For example, `tf.AvgPool` is defined as
@@ -172,9 +175,9 @@ understanding the operation.
### Operation arguments
-There are two kinds of arguments: operands and attributes. Operands are runtime
-values produced by other ops; while attributes are compile-time known constant
-values, including two categories:
+There are three kinds of arguments: operands, attributes, and properties.
+Operands are runtime values produced by other ops, while attributes and properties
+are compile-time known constant values, including two categories:
1. Natural attributes: these attributes affect the behavior of the operations
(e.g., padding for convolution);
@@ -187,8 +190,11 @@ values, including two categories:
even though they are not materialized, it should be possible to store as an
attribute.
-Both operands and attributes are specified inside the `dag`-typed `arguments`,
-led by `ins`:
+Properties are similar to attributes, except that they are not stored within
+the MLIR context but are stored inline with the operation.
+
+Operands, attributes, and properties are specified inside the `dag`-typed
+`arguments`, led by `ins`:
```tablegen
let arguments = (ins
@@ -196,13 +202,15 @@ let arguments = (ins
...
<attr-constraint>:$<attr-name>,
...
+ <property-constraint>:$<property-name>,
);
```
Here `<type-constraint>` is a TableGen `def` from the `TypeConstraint` class
hierarchy. Similarly, `<attr-constraint>` is a TableGen `def` from the
-`AttrConstraint` class hierarchy. See [Constraints](#constraints) for more
-information.
+`AttrConstraint` class hierarchy and `<property-constraint>` is a subclass
+of `Property` (though a `PropertyConstraint` hierarchy is planned).
+See [Constraints](#constraints) for more information.
There are no requirements on the relative order of operands and attributes; they
can mix freely. The relative order of operands themselves matters. From each
@@ -324,6 +332,18 @@ Right now, the following primitive constraints are supported:
TODO: Design and implement more primitive constraints
+#### Optional and default-valued properties
+
+To declare a property with a default value, use `DefaultValuedProperty<..., "...">`.
+If the property's storage data type is different from its interface type,
+for example, in the case of array properties (which are stored as `SmallVector`s
+but use `ArrayRef` as an interface type), add the storage-type equivalent
+of the default value as the third argument.
+
+To declare an optional property, use `OptionalProperty<...>`.
+This wraps the underlying property in an `std::optional` and gives it a
+default value of `std::nullopt`.
+
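+As a sketch (the property names `$alignment` and `$label` here are invented
+for illustration), an op might declare one of each:
+
+```tablegen
+let arguments = (ins
+  DefaultValuedProperty<I64Property, "0">:$alignment,
+  OptionalProperty<StringProperty>:$label
+);
+```
+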
#### Combining constraints
`AllAttrOf` is provided to allow combination of multiple constraints which
@@ -429,6 +449,8 @@ def MyOp : ... {
I32Attr:$i32_attr,
F32Attr:$f32_attr,
...
+ I32Property:$i32_prop,
+ ...
);
let results = (outs
@@ -453,7 +475,8 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop);
// Each result-type/operand/attribute has a separate parameter. The parameters
// for attributes are raw values unwrapped with mlir::Attribute instances.
@@ -462,13 +485,15 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- APInt i32_attr, StringRef f32_attr, ...);
+ APInt i32_attr, StringRef f32_attr, ...,
+ int32_t i32_prop, ...);
// Each operand/attribute has a separate parameter but result type is aggregate.
static void build(OpBuilder &odsBuilder, OperationState &odsState,
TypeRange resultTypes,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop, ...);
// All operands/attributes have aggregate parameters.
// Generated if return type can be inferred.
@@ -921,8 +946,10 @@ optional-group: `(` then-elements `)` (`:` `(` else-elements `)`)? `?`
The elements of an optional group have the following requirements:
*   The first element of `then-elements` must be an attribute, literal,
- operand, or region.
+ operand, property, or region.
- This is because the first element must be optionally parsable.
+ - If a property is used, it must have an `optionalParser` defined and have a
+ default value.
* Exactly one argument variable or type directive within either
`then-elements` or `else-elements` must be marked as the anchor of the
group.
@@ -984,6 +1011,8 @@ foo.op is_read_only
foo.op
```
+The same logic applies to a `UnitProperty`.
+
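+For example, a format along these lines (the op and name are assumed for
+illustration, with `$is_read_only` declared as a `UnitProperty`) would elide
+the keyword when the property is false:
+
+```tablegen
+let assemblyFormat = "(`is_read_only` $is_read_only^)? attr-dict";
+```
+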
##### Optional "else" Group
Optional groups also have support for an "else" group of elements. These are
@@ -1026,6 +1055,8 @@ to:
1. All operand and result types must appear within the format using the various
`type` directives, either individually or with the `operands` or `results`
directives.
+1. Unless all non-attribute properties appear in the format, the `prop-dict`
+ directive must be present.
1. The `attr-dict` directive must always be present.
1. Must not contain overlapping information; e.g. multiple instances of
'attr-dict', types, operands, etc.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index b14bd83..260d421 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -59,22 +59,9 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
list<Trait> traits = []> :
LLVM_ArithmeticOpBase<AnySignlessInteger, mnemonic, instName,
!listconcat([DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)> {
- dag iofArg = (ins EnumProperty<"IntegerOverflowFlags">:$overflowFlags);
+ dag iofArg = (ins EnumProperty<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
let arguments = !con(commonArgs, iofArg);
- let builders = [
- OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, type, lhs, rhs);
- }]>,
- OpBuilder<(ins "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, lhs, rhs);
- }]>
- ];
-
string mlirBuilder = [{
auto op = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs);
moduleImport.setIntegerOverflowFlags(inst, op);
diff --git a/mlir/include/mlir/IR/ODSSupport.h b/mlir/include/mlir/IR/ODSSupport.h
index 70e3f98..25d6f3d 100644
--- a/mlir/include/mlir/IR/ODSSupport.h
+++ b/mlir/include/mlir/IR/ODSSupport.h
@@ -33,6 +33,37 @@ convertFromAttribute(int64_t &storage, Attribute attr,
/// Convert the provided int64_t to an IntegerAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, int64_t storage);
+/// Convert an IntegerAttr attribute to an int32_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided, an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided int32_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, int32_t storage);
+
+/// Extract the string from `attr` into `storage`. If `attr` is not a
+/// `StringAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given string into a StringAttr. Note that this takes a reference
+/// to the storage of a string property, which is a std::string.
+Attribute convertToAttribute(MLIRContext *ctx, const std::string &storage);
+
+/// Extract the boolean from `attr` into `storage`. If `attr` is not a
+/// `BoolAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given bool into a BoolAttr.
+Attribute convertToAttribute(MLIRContext *ctx, bool storage);
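+
+// A usage sketch (illustrative only; `attr`, `ctx`, and `diag` are assumed to
+// be in scope, with `diag` a `function_ref<InFlightDiagnostic()>`):
+//
+//   bool flag = false;
+//   if (failed(convertFromAttribute(flag, attr, diag)))
+//     return failure();
+//   Attribute roundTripped = convertToAttribute(ctx, flag);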
+
/// Convert a DenseI64ArrayAttr to the provided storage. It is expected that the
/// storage has the same size as the array. An error is returned if the
/// attribute isn't a DenseI64ArrayAttr or it does not have the same size. If
@@ -49,9 +80,24 @@ LogicalResult
convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
function_ref<InFlightDiagnostic()> emitError);
+/// Convert a DenseI64ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI64ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert a DenseI32ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI32ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
/// Convert the provided ArrayRef<int64_t> to a DenseI64ArrayAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, ArrayRef<int64_t> storage);
} // namespace mlir
-#endif // MLIR_IR_ODSSUPPORT_H \ No newline at end of file
+#endif // MLIR_IR_ODSSUPPORT_H
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 0babdbb..0becf7d 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -29,7 +29,6 @@ class Property<string storageTypeParam = "", string desc = ""> {
//
// Format:
// - `$_storage` will contain the property in the storage type.
- // - `$_ctxt` will contain an `MLIRContext *`.
code convertFromStorage = "$_storage";
// The call expression to build a property storage from the interface type.
@@ -40,24 +39,26 @@ class Property<string storageTypeParam = "", string desc = ""> {
code assignToStorage = "$_storage = $_value";
// The call expression to convert from the storage type to an attribute.
+ // The resulting attribute must be non-null in non-error cases.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_ctxt` is a `MLIRContext *`.
//
- // The expression must result in an Attribute.
+ // The expression must return an `Attribute` and will be used as a function body.
code convertToAttribute = [{
- convertToAttribute($_ctxt, $_storage)
+ return convertToAttribute($_ctxt, $_storage);
}];
// The call expression to convert from an Attribute to the storage type.
//
// Format:
- // - `$_storage` is the storage type value.
+ // - `$_storage` is a reference to a value of the storage type.
// - `$_attr` is the attribute.
// - `$_diag` is a callback to get a Diagnostic to emit error.
//
- // The expression must return a LogicalResult
+ // The expression must return a LogicalResult and will be used as a function body
+ // or in other similar contexts.
code convertFromAttribute = [{
return convertFromAttribute($_storage, $_attr, $_diag);
}];
@@ -68,18 +69,68 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the variable to hash.
//
// The expression should define a llvm::hash_code.
- code hashProperty = [{
- llvm::hash_value($_storage);
+ // If unspecified, defaults to `llvm::hash_value($_storage)`.
+ // The default is not specified in tablegen because many combinators, like
+ // ArrayProperty, can fall back to more efficient implementations of
+ // `hashProperty` when their underlying elements have trivial hashing.
+ code hashProperty = "";
+
+ // The body of the parser for a value of this property.
+ // Format:
+ // - `$_parser` is the OpAsmParser.
+ // - `$_storage` is the location into which the value is to be placed if it is
+ // present.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This defines the body of a function (typically a lambda) that returns a
+ // ParseResult. There is an implicit `return success()` at the end of the parser
+ // code.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
+ code parser = [{
+ auto value = ::mlir::FieldParser<}] # storageType # [{>::parse($_parser);
+ if (::mlir::failed(value))
+ return ::mlir::failure();
+ $_storage = std::move(*value);
}];
+ // The body of the parser for a value of this property as the anchor of an optional
+ // group. This should parse the property if possible and do nothing if a value of
+ // the relevant type is not next in the parse stream.
+ // You are not required to define this parser if it cannot be meaningfully
+ // implemented.
+ // This has the same context and substitutions as `parser` except that it is
+ // required to return an OptionalParseResult.
+ //
+ // If the optional parser doesn't parse anything, it should not set
+ // $_storage, since the parser doesn't know if the default value has been
+ // overwritten.
+ code optionalParser = "";
+
+ // The printer for a value of this property.
+ // Format:
+ // - `$_storage` is the storage data.
+ // - `$_printer` is the OpAsmPrinter instance.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This may be called in an expression context, so variable declarations must
+ // be placed within a new scope.
+ //
+  // The printer for a property should always print a non-empty value; default-value
+  // printing elision happens outside the context of this printing expression.
+ code printer = "$_printer << $_storage";
+
// The call expression to emit the storage type to bytecode.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_writer` is a `DialectBytecodeWriter`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+  // This will become the body of a function returning void.
code writeToMlirBytecode = [{
- writeToMlirBytecode($_writer, $_storage)
+ writeToMlirBytecode($_writer, $_storage);
}];
// The call expression to read the storage type from bytecode.
@@ -88,13 +139,31 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the storage type value.
// - `$_reader` is a `DialectBytecodeReader`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+ // This will become the body of a function returning LogicalResult.
+ // There is an implicit `return success()` at the end of this function.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
code readFromMlirBytecode = [{
if (::mlir::failed(readFromMlirBytecode($_reader, $_storage)))
return ::mlir::failure();
}];
- // Default value for the property.
- string defaultValue = ?;
+  // Base definition for the property. Used (or will be used) for
+  // `OptionalProperty` and similar cases, analogously to `baseAttr`.
+ Property baseProperty = ?;
+
+ // Default value for the property within its storage. This should be an expression
+  // of type `interfaceType` and should be comparable with other values of that
+  // interface type with `==`. The empty string means there is no default value.
+ string defaultValue = "";
+
+  // If set, the default value that the storage of the property should be
+  // initialized to. This is only needed when the storage and interface types of
+  // the property are distinct (e.g., SmallVector for storage vs. ArrayRef for
+  // interfacing); otherwise this falls back to `defaultValue`.
+ string storageTypeValueOverride = "";
}
/// Implementation of the Property class's `readFromMlirBytecode` field using
@@ -133,12 +202,16 @@ defvar writeMlirBytecodeWithConvertToAttribute = [{
// Primitive property kinds
// Any kind of integer stored as properties.
-class IntProperty<string storageTypeParam = "", string desc = ""> :
+class IntProperty<string storageTypeParam, string desc = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ let summary = !if(!empty(desc), storageTypeParam, desc);
+ let optionalParser = [{
+ return $_parser.parseOptionalInteger($_storage);
+ }];
+ let writeToMlirBytecode = [{
$_writer.writeVarInt($_storage);
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
@@ -146,24 +219,472 @@ class IntProperty<string storageTypeParam = "", string desc = ""> :
}];
}
-class ArrayProperty<string storageTypeParam = "", int n, string desc = ""> :
- Property<storageTypeParam # "[" # n # "]", desc> {
- let interfaceType = "::llvm::ArrayRef<" # storageTypeParam # ">";
- let convertFromStorage = "$_storage";
- let assignToStorage = "::llvm::copy($_value, $_storage)";
-}
+def I32Property : IntProperty<"int32_t">;
+def I64Property : IntProperty<"int64_t">;
-class EnumProperty<string storageTypeParam, string desc = ""> :
+class EnumProperty<string storageTypeParam, string desc = "", string default = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ // TODO: take advantage of EnumAttrInfo and the like to make this share nice
+ // parsing code with EnumAttr.
+ let writeToMlirBytecode = [{
$_writer.writeVarInt(static_cast<uint64_t>($_storage));
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
$_storage = static_cast<}] # storageTypeParam # [{>(val);
}];
+ let defaultValue = default;
}
+def StringProperty : Property<"std::string", "string"> {
+ let interfaceType = "::llvm::StringRef";
+ let convertFromStorage = "::llvm::StringRef{$_storage}";
+ let assignToStorage = "$_storage = $_value.str()";
+ let optionalParser = [{
+ if (::mlir::failed($_parser.parseOptionalString(&$_storage)))
+ return std::nullopt;
+ }];
+ let printer = "$_printer.printString($_storage)";
+ let readFromMlirBytecode = [{
+ StringRef val;
+ if (::mlir::failed($_reader.readString(val)))
+ return ::mlir::failure();
+ $_storage = val.str();
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedString($_storage);
+ }];
+}
+
+def BoolProperty : IntProperty<"bool", "boolean"> {
+ let printer = [{ $_printer << ($_storage ? "true" : "false") }];
+ let readFromMlirBytecode = [{
+ return $_reader.readBool($_storage);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+}
+
+def UnitProperty : Property<"bool", "unit property"> {
+ let summary = "unit property";
+ let description = [{
+    A property whose presence or absence is used as a flag.
+
+    This is stored as a boolean that defaults to false, and is named UnitProperty
+    by analogy with UnitAttr, whose documentation gives the more comprehensive
+    rationale and explains the less typical syntax.
+
+    Note that this property does have a syntax for the false case to allow for its
+    use in contexts where default values shouldn't be elided.
+ }];
+ let defaultValue = "false";
+
+ let convertToAttribute = [{
+ if ($_storage)
+ return ::mlir::UnitAttr::get($_ctxt);
+ else
+ return ::mlir::BoolAttr::get($_ctxt, false);
+ }];
+ let convertFromAttribute = [{
+ if (::llvm::isa<::mlir::UnitAttr>($_attr)) {
+ $_storage = true;
+ return ::mlir::success();
+ }
+ if (auto boolAttr = ::llvm::dyn_cast<::mlir::BoolAttr>($_attr)) {
+ $_storage = boolAttr.getValue();
+ return ::mlir::success();
+ }
+ return ::mlir::failure();
+ }];
+
+ let parser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected 'unit' or 'unit_absent'");
+ $_storage = (keyword == "unit");
+ }];
+
+ let optionalParser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return std::nullopt;
+ $_storage = (keyword == "unit");
+ }];
+
+ let printer = [{
+ $_printer << ($_storage ? "unit" : "unit_absent")
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+ let readFromMlirBytecode = [{
+ if (::mlir::failed($_reader.readBool($_storage)))
+ return ::mlir::failure();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Primitive property combinators
+
+/// Create a variable named `name` of `prop`'s storage type that is initialized
+/// to the correct default value, if there is one.
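+/// For instance (illustrative), for a property whose storage type is `int64_t`
+/// and whose default value is `0`, `_makePropStorage<prop, "val">.ret` expands
+/// to `int64_t val = 0;`.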
+class _makePropStorage<Property prop, string name> {
+ code ret = prop.storageType # " " # name
+ # !cond(!not(!empty(prop.storageTypeValueOverride)) : " = " # prop.storageTypeValueOverride,
+ !not(!empty(prop.defaultValue)) : " = " # prop.defaultValue,
+ true : "") # ";";
+}
+
+/// The generic class for arrays of some other property, stored as a
+/// `SmallVector` of that property's storage type. This uses an `ArrayAttr` as
+/// its attribute form, though subclasses can override this, as is the case with
+/// `IntArrayProperty` below.
+/// Those wishing to use a non-default number of SmallVector elements should
+/// subclass `ArrayProperty`.
+class ArrayProperty<Property elem = Property<>, string desc = ""> :
+ Property<"::llvm::SmallVector<" # elem.storageType # ">", desc> {
+ let summary = "array of " # elem.summary;
+ let interfaceType = "::llvm::ArrayRef<" # elem.storageType # ">";
+ let convertFromStorage = "::llvm::ArrayRef<" # elem.storageType # ">{$_storage}";
+ let assignToStorage = "$_storage.assign($_value.begin(), $_value.end())";
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast_if_present<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected array attribute";
+ for (::mlir::Attribute elemAttr : arrayAttr) {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRes = [&](Attribute propAttr, }] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_attr", "propAttr",
+ !subst("$_storage", "propStorage", elem.convertFromAttribute)) # [{
+ }(elemAttr, elemVal);
+ if (::mlir::failed(elemRes))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ SmallVector<Attribute> elems;
+ for (const auto& elemVal : $_storage) {
+ auto elemAttr = [&](const }] # elem.storageType #[{& propStorage) -> ::mlir::Attribute {
+ }] # !subst("$_storage", "propStorage", elem.convertToAttribute) # [{
+ }(elemVal);
+ elems.push_back(elemAttr);
+ }
+ return ::mlir::ArrayAttr::get($_ctxt, elems);
+ }];
+
+ defvar theParserBegin = [{
+ auto& storage = $_storage;
+ auto parseElemFn = [&]() -> ::mlir::ParseResult {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemParse = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", elem.parser) # [{
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemParse))
+ return ::mlir::failure();
+ storage.push_back(std::move(elemVal));
+ return ::mlir::success();
+ };
+ }];
+ let parser = theParserBegin # [{
+ return $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::Square, parseElemFn);
+ }];
+ // Hack around the lack of a peek method
+ let optionalParser = theParserBegin # [{
+ auto oldLoc = $_parser.getCurrentLocation();
+ auto parseResult = $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::OptionalSquare, parseElemFn);
+ if (::mlir::failed(parseResult))
+ return ::mlir::failure();
+ auto newLoc = $_parser.getCurrentLocation();
+ if (oldLoc == newLoc)
+ return std::nullopt;
+ return ::mlir::success();
+ }];
+
+ let printer = [{ [&](){
+ $_printer << "[";
+ auto elemPrinter = [&](const }] # elem.storageType # [{& elemVal) {
+ }] # !subst("$_storage", "elemVal", elem.printer) #[{;
+ };
+ ::llvm::interleaveComma($_storage, $_printer, elemPrinter);
+ $_printer << "]";
+ }()}];
+
+ let readFromMlirBytecode = [{
+ uint64_t length;
+ if (::mlir::failed($_reader.readVarInt(length)))
+ return ::mlir::failure();
+ $_storage.reserve(length);
+ for (uint64_t i = 0; i < length; ++i) {
+ }]# _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRead = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", elem.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemRead))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeVarInt($_storage.size());
+ for (const auto& elemVal : $_storage) {
+ [&]() {
+ }] # !subst("$_storage", "elemVal", elem.writeToMlirBytecode) #[{;
+ }();
+ }
+ }];
+
+ // There's no hash_value for SmallVector<T>, so we construct the ArrayRef ourselves.
+ // In the non-trivial case, we define a mapped range to get internal hash
+ // codes.
+ let hashProperty = !if(!empty(elem.hashProperty),
+ [{::llvm::hash_value(::llvm::ArrayRef<}] # elem.storageType # [{>{$_storage})}],
+ [{[&]() -> ::llvm::hash_code {
+ auto getElemHash = [](const auto& propStorage) -> ::llvm::hash_code {
+ return }] # !subst("$_storage", "propStorage", elem.hashProperty) # [{;
+ };
+ auto mapped = ::llvm::map_range($_storage, getElemHash);
+ return ::llvm::hash_combine_range(mapped.begin(), mapped.end());
+ }()
+ }]);
+}
+
+class IntArrayProperty<string storageTypeParam = "", string desc = ""> :
+ ArrayProperty<IntProperty<storageTypeParam, desc>> {
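+  // For instance (illustrative), IntArrayProperty<"int32_t"> is stored as a
+  // SmallVector<int32_t>, and its convertFromAttribute resolves to the
+  // SmallVectorImpl<int32_t> overload declared in ODSSupport.h.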
+ // Bring back the trivial conversions we don't get in the general case.
+ let convertFromAttribute = [{
+ return convertFromAttribute($_storage, $_attr, $_diag);
+ }];
+ let convertToAttribute = [{
+ return convertToAttribute($_ctxt, $_storage);
+ }];
+}
+
+/// Class for giving a property a default value.
+/// This doesn't change anything about the property other than giving it a default
+/// which can be used by ODS to elide printing.
+class DefaultValuedProperty<Property p, string default = "", string storageDefault = ""> : Property<p.storageType, p.summary> {
+ let defaultValue = default;
+ let storageTypeValueOverride = storageDefault;
+ let baseProperty = p;
+ // Keep up to date with `Property` above.
+ let summary = p.summary;
+ let description = p.description;
+ let storageType = p.storageType;
+ let interfaceType = p.interfaceType;
+ let convertFromStorage = p.convertFromStorage;
+ let assignToStorage = p.assignToStorage;
+ let convertToAttribute = p.convertToAttribute;
+ let convertFromAttribute = p.convertFromAttribute;
+ let hashProperty = p.hashProperty;
+ let parser = p.parser;
+ let optionalParser = p.optionalParser;
+ let printer = p.printer;
+ let readFromMlirBytecode = p.readFromMlirBytecode;
+ let writeToMlirBytecode = p.writeToMlirBytecode;
+}
+
+/// An optional property, stored as a std::optional<p.storageType> and
+/// interfaced with as a std::optional<p.interfaceType>.
+/// The syntax is `none` (or the empty string if elided) for an absent value or
+/// `some<[underlying property]>` when a value is set.
+///
+/// As a special exception, if the underlying property has an optional parser and
+/// no default value (e.g., an integer property), the `some` bracketing is skipped
+/// and parsing and printing delegate to the underlying property. In that case, the
+/// syntax is the syntax of the underlying property, or the keyword `none` in the
+/// rare cases that it is needed. This behavior can be disabled by setting
+/// `canDelegateParsing` to 0.
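+///
+/// For example (illustrative): OptionalProperty<I64Property> delegates, so a
+/// present value prints as a bare integer such as `5` and an absent one as
+/// `none`; OptionalProperty<StringProperty, 0> does not delegate, so it prints
+/// as `some<"foo">` or `none`.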
+class OptionalProperty<Property p, bit canDelegateParsing = 1>
+ : Property<"std::optional<" # p.storageType # ">", "optional " # p.summary> {
+
+  // In the cases where the underlying property is plain old data that's passed
+  // by value, the conversion code is trivial.
+ defvar hasTrivialStorage = !and(!eq(p.convertFromStorage, "$_storage"),
+ !eq(p.assignToStorage, "$_storage = $_value"),
+ !eq(p.storageType, p.interfaceType));
+
+ defvar delegatesParsing = !and(!empty(p.defaultValue),
+ !not(!empty(p.optionalParser)), canDelegateParsing);
+
+ let interfaceType = "std::optional<" # p.interfaceType # ">";
+ let defaultValue = "std::nullopt";
+
+ let convertFromStorage = !if(hasTrivialStorage,
+ p.convertFromStorage,
+ [{($_storage.has_value() ? std::optional<}] # p.interfaceType # ">{"
+ # !subst("$_storage", "(*($_storage))", p.convertFromStorage)
+ # [{} : std::nullopt)}]);
+ let assignToStorage = !if(hasTrivialStorage,
+ p.assignToStorage,
+ [{[&]() {
+ if (!$_value.has_value()) {
+ $_storage = std::nullopt;
+ return;
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ [&](}] # p.storageType # [{& propStorage) {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_value", "(*($_value))", p.assignToStorage)) # [{;
+ }(presentVal);
+ $_storage = std::move(presentVal);
+ }()}]);
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected optional properties to materialize as arrays";
+ if (arrayAttr.size() > 1)
+ return $_diag() << "expected optional properties to become 0- or 1-element arrays";
+ if (arrayAttr.empty()) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ ::mlir::Attribute presentAttr = arrayAttr[0];
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentRes = [&](Attribute propAttr, }] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_attr", "propAttr", p.convertFromAttribute)) # [{
+ }(presentAttr, presentVal);
+ if (::mlir::failed(presentRes))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ if (!$_storage.has_value()) {
+ return ::mlir::ArrayAttr::get($_ctxt, {});
+ }
+ auto attr = [&]() -> ::mlir::Attribute {
+ }] # !subst("$_storage", "(*($_storage))", p.convertToAttribute) # [{
+ }();
+ return ::mlir::ArrayAttr::get($_ctxt, {attr});
+ }];
+
+ defvar delegatedParserBegin = [{
+ if (::mlir::succeeded($_parser.parseOptionalKeyword("none"))) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] #_makePropStorage<p, "presentVal">.ret # [{
+ auto delegParseResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::OptionalParseResult {
+ }] # !subst("$_storage", "propStorage", p.optionalParser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (!delegParseResult.has_value()) {
+ }];
+
+ defvar delegatedParserEnd = [{
+ }
+ if (delegParseResult.has_value() && ::mlir::failed(*delegParseResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+  // If we're being explicitly called for our parser, we expect to have been
+  // printed into a context where the default value isn't elided. Therefore, a
+  // not-present result from the underlying parser is a failure.
+ defvar delegatedParser = delegatedParserBegin # [{
+ return ::mlir::failure();
+ }] # delegatedParserEnd;
+ defvar delegatedOptionalParser = delegatedParserBegin # [{
+ return std::nullopt;
+ }] # delegatedParserEnd;
+
+ defvar generalParserBegin = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword, {"none", "some"}))) {
+ }];
+ defvar generalParserEnd = [{
+ }
+ if (keyword == "none") {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ if (::mlir::failed($_parser.parseLess()))
+ return ::mlir::failure();
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentParse = [&](}] # p.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", p.parser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (presentParse || $_parser.parseGreater())
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ defvar generalParser = generalParserBegin # [{
+ return $_parser.emitError($_parser.getCurrentLocation(), "expected 'none' or 'some<prop>'");
+ }] # generalParserEnd;
+ defvar generalOptionalParser = generalParserBegin # [{
+ return std::nullopt;
+ }] # generalParserEnd;
+
+ let parser = !if(delegatesParsing, delegatedParser, generalParser);
+ let optionalParser = !if(delegatesParsing,
+ delegatedOptionalParser, generalOptionalParser);
+
+ defvar delegatedPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ }()}];
+ defvar generalPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ $_printer << "some<";
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ $_printer << ">";
+ }()}];
+ let printer = !if(delegatesParsing, delegatedPrinter, generalPrinter);
+
+ let readFromMlirBytecode = [{
+ bool isPresent = false;
+ if (::mlir::failed($_reader.readBool(isPresent)))
+ return ::mlir::failure();
+ if (!isPresent) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", p.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(presentVal);
+ if (::mlir::failed(presentResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage.has_value());
+ if (!$_storage.has_value())
+ return;
+ }] # !subst("$_storage", "(*($_storage))", p.writeToMlirBytecode);
+
+ let hashProperty = !if(!empty(p.hashProperty), p.hashProperty,
+ [{ ::llvm::hash_value($_storage.has_value() ? std::optional<::llvm::hash_code>{}] #
+ !subst("$_storage", "(*($_storage))", p.hashProperty) #[{} : std::nullopt) }]);
+ assert !or(!not(delegatesParsing), !eq(defaultValue, "std::nullopt")),
+ "For delegated parsing to be used, the default value must be nullopt. " #
+ "To use a non-trivial default, set the canDelegateParsing argument to 0";
+}
#endif // PROPERTIES
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index cc5853c..768291a 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -384,7 +384,7 @@ private:
SmallVector<NamedAttribute, 4> attributes;
/// The properties of the op.
- SmallVector<NamedProperty> properties;
+ SmallVector<NamedProperty, 4> properties;
/// The arguments of the op (operands and native attributes).
SmallVector<Argument, 4> arguments;
diff --git a/mlir/include/mlir/TableGen/Property.h b/mlir/include/mlir/TableGen/Property.h
index d0d6f49..702e675 100644
--- a/mlir/include/mlir/TableGen/Property.h
+++ b/mlir/include/mlir/TableGen/Property.h
@@ -35,12 +35,20 @@ class Property {
public:
explicit Property(const llvm::Record *record);
explicit Property(const llvm::DefInit *init);
- Property(StringRef storageType, StringRef interfaceType,
- StringRef convertFromStorageCall, StringRef assignToStorageCall,
- StringRef convertToAttributeCall, StringRef convertFromAttributeCall,
+ Property(StringRef summary, StringRef description, StringRef storageType,
+ StringRef interfaceType, StringRef convertFromStorageCall,
+ StringRef assignToStorageCall, StringRef convertToAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall, StringRef hashPropertyCall,
- StringRef defaultValue);
+ StringRef defaultValue, StringRef storageTypeValueOverride);
+
+ // Returns the summary (for error messages) of this property's type.
+ StringRef getSummary() const { return summary; }
+
+ // Returns the description of this property.
+ StringRef getDescription() const { return description; }
// Returns the storage type.
StringRef getStorageType() const { return storageType; }
@@ -66,6 +74,19 @@ public:
return convertFromAttributeCall;
}
+ // Returns the method call which parses this property from textual MLIR.
+ StringRef getParserCall() const { return parserCall; }
+
+ // Returns true if this property has defined an optional parser.
+ bool hasOptionalParser() const { return !optionalParserCall.empty(); }
+
+ // Returns the method call which optionally parses this property from textual
+ // MLIR.
+ StringRef getOptionalParserCall() const { return optionalParserCall; }
+
+ // Returns the method call which prints this property to textual MLIR.
+ StringRef getPrinterCall() const { return printerCall; }
+
// Returns the method call which reads this property from
// bytecode and assign it to the storage.
StringRef getReadFromMlirBytecodeCall() const {
@@ -87,6 +108,24 @@ public:
// Returns the default value for this Property.
StringRef getDefaultValue() const { return defaultValue; }
+ // Returns whether this Property has a default storage-type value that is
+ // distinct from its default interface-type value.
+ bool hasStorageTypeValueOverride() const {
+ return !storageTypeValueOverride.empty();
+ }
+
+ StringRef getStorageTypeValueOverride() const {
+ return storageTypeValueOverride;
+ }
+
+ // Returns this property's TableGen def-name.
+ StringRef getPropertyDefName() const;
+
+  // Returns the base-level property that this Property constraint is based on,
+  // or the Property itself otherwise. (Note: there are currently no property
+  // constraints; this function is added for future-proofing.)
+ Property getBaseProperty() const;
+
// Returns the TableGen definition this Property was constructed from.
const llvm::Record &getDef() const { return *def; }
@@ -95,16 +134,22 @@ private:
const llvm::Record *def;
// Elements describing a Property, in general fetched from the record.
+ StringRef summary;
+ StringRef description;
StringRef storageType;
StringRef interfaceType;
StringRef convertFromStorageCall;
StringRef assignToStorageCall;
StringRef convertToAttributeCall;
StringRef convertFromAttributeCall;
+ StringRef parserCall;
+ StringRef optionalParserCall;
+ StringRef printerCall;
StringRef readFromMlirBytecodeCall;
StringRef writeToMlirBytecodeCall;
StringRef hashPropertyCall;
StringRef defaultValue;
+ StringRef storageTypeValueOverride;
};
// A struct wrapping an op property and its name together
diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index aca25ca..6be5548 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -9,6 +9,7 @@
#define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_
#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 004428b..c76d489 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -774,6 +774,94 @@ struct ConvertIllegalShapeCastOpsToTransposes
}
};
+/// Returns an iterator over the dims (inc scalability) of a VectorType.
+static auto getDims(VectorType vType) {
+ return llvm::zip_equal(vType.getShape(), vType.getScalableDims());
+}
+
+/// Helper to drop (fixed-size) unit dims from a VectorType.
+static VectorType dropUnitDims(VectorType vType) {
+ SmallVector<bool> scalableFlags;
+ SmallVector<int64_t> dimSizes;
+ for (auto dim : getDims(vType)) {
+ if (dim == std::make_tuple(1, false))
+ continue;
+ auto [size, scalableFlag] = dim;
+ dimSizes.push_back(size);
+ scalableFlags.push_back(scalableFlag);
+ }
+ return VectorType::get(dimSizes, vType.getElementType(), scalableFlags);
+}
+
+/// A pattern to swap shape_cast(transpose) with transpose(shape_cast) if the
+/// shape_cast only drops unit dimensions.
+///
+/// This simplifies the transpose making it possible for other legalization
+/// rewrites to handle it.
+///
+/// Example:
+///
+/// BEFORE:
+/// ```mlir
+/// %0 = vector.transpose %vector, [3, 0, 1, 2]
+/// : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+/// %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+/// ```
+///
+/// AFTER:
+/// ```mlir
+/// %0 = vector.shape_cast %arg0 : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+/// %1 = vector.transpose %0, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+/// ```
+struct SwapShapeCastOfTranspose : public OpRewritePattern<vector::ShapeCastOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
+ PatternRewriter &rewriter) const override {
+ auto transposeOp =
+ shapeCastOp.getSource().getDefiningOp<vector::TransposeOp>();
+ if (!transposeOp)
+ return rewriter.notifyMatchFailure(shapeCastOp, "not TransposeOp");
+
+ auto resultType = shapeCastOp.getResultVectorType();
+ if (resultType.getRank() <= 1)
+ return rewriter.notifyMatchFailure(shapeCastOp, "result rank too low");
+
+ if (resultType != dropUnitDims(shapeCastOp.getSourceVectorType()))
+ return rewriter.notifyMatchFailure(
+ shapeCastOp, "ShapeCastOp changes non-unit dimension(s)");
+
+ auto transposeSourceVectorType = transposeOp.getSourceVectorType();
+ auto transposeSourceDims =
+ llvm::to_vector(getDims(transposeSourceVectorType));
+
+ // Construct a map from dimIdx -> number of dims dropped before dimIdx.
+ SmallVector<int64_t> droppedDimsBefore(transposeSourceVectorType.getRank());
+ int64_t droppedDims = 0;
+ for (auto [i, dim] : llvm::enumerate(transposeSourceDims)) {
+ droppedDimsBefore[i] = droppedDims;
+ if (dim == std::make_tuple(1, false))
+ ++droppedDims;
+ }
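+    // Illustrative example: for a transpose source of type vector<1x1x4x[4]xf32>,
+    // droppedDimsBefore is [0, 1, 2, 2], so the permutation [3, 0, 1, 2] below
+    // becomes [1, 0] once the unit dims are dropped.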
+
+ // Drop unit dims from transpose permutation.
+ auto perm = transposeOp.getPermutation();
+ SmallVector<int64_t> newPerm;
+ for (int64_t idx : perm) {
+ if (transposeSourceDims[idx] == std::make_tuple(1, false))
+ continue;
+ newPerm.push_back(idx - droppedDimsBefore[idx]);
+ }
+
+ auto loc = shapeCastOp.getLoc();
+ auto newShapeCastOp = rewriter.create<vector::ShapeCastOp>(
+ loc, dropUnitDims(transposeSourceVectorType), transposeOp.getVector());
+ rewriter.replaceOpWithNewOp<vector::TransposeOp>(shapeCastOp,
+ newShapeCastOp, newPerm);
+ return success();
+ }
+};
+
/// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use
/// the ZA state. This workaround exists to support these transposes when ZA is
/// available.
@@ -939,7 +1027,8 @@ struct VectorLegalizationPass
patterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
LiftIllegalVectorTransposeToMemory,
ConvertIllegalShapeCastOpsToTransposes,
- LowerIllegalTransposeStoreViaZA>(context);
+ SwapShapeCastOfTranspose, LowerIllegalTransposeStoreViaZA>(
+ context);
// Note: These two patterns are added with a high benefit to ensure:
// - Masked outer products are handled before unmasked ones
// - Multi-tile writes are lowered as a store loop (if possible)
diff --git a/mlir/lib/IR/ODSSupport.cpp b/mlir/lib/IR/ODSSupport.cpp
index 6e968d62..d56c75e 100644
--- a/mlir/lib/IR/ODSSupport.cpp
+++ b/mlir/lib/IR/ODSSupport.cpp
@@ -33,6 +33,50 @@ Attribute mlir::convertToAttribute(MLIRContext *ctx, int64_t storage) {
return IntegerAttr::get(IntegerType::get(ctx, 64), storage);
}
+LogicalResult
+mlir::convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<IntegerAttr>(attr);
+ if (!valueAttr) {
+ emitError() << "expected IntegerAttr for key `value`";
+ return failure();
+ }
+ storage = valueAttr.getValue().getSExtValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, int32_t storage) {
+ return IntegerAttr::get(IntegerType::get(ctx, 32), storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<StringAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+ << "expected string property to come from string attribute";
+ storage = valueAttr.getValue().str();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx,
+ const std::string &storage) {
+ return StringAttr::get(ctx, storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<BoolAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+        << "expected boolean property to come from bool attribute";
+ storage = valueAttr.getValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, bool storage) {
+ return BoolAttr::get(ctx, storage);
+}
+
template <typename DenseArrayTy, typename T>
LogicalResult
convertDenseArrayFromAttr(MutableArrayRef<T> storage, Attribute attr,
@@ -64,6 +108,33 @@ mlir::convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
"DenseI32ArrayAttr");
}
+template <typename DenseArrayTy, typename T>
+LogicalResult
+convertDenseArrayFromAttr(SmallVectorImpl<T> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError,
+ StringRef denseArrayTyStr) {
+ auto valueAttr = dyn_cast<DenseArrayTy>(attr);
+ if (!valueAttr) {
+ emitError() << "expected " << denseArrayTyStr << " for key `value`";
+ return failure();
+ }
+ storage.resize_for_overwrite(valueAttr.size());
+ llvm::copy(valueAttr.asArrayRef(), storage.begin());
+ return success();
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI64ArrayAttr>(storage, attr, emitError,
+ "DenseI64ArrayAttr");
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI32ArrayAttr>(storage, attr, emitError,
+ "DenseI32ArrayAttr");
+}
+
Attribute mlir::convertToAttribute(MLIRContext *ctx,
ArrayRef<int64_t> storage) {
return DenseI64ArrayAttr::get(ctx, storage);
diff --git a/mlir/lib/TableGen/Property.cpp b/mlir/lib/TableGen/Property.cpp
index e61d2fd..b86b87df 100644
--- a/mlir/lib/TableGen/Property.cpp
+++ b/mlir/lib/TableGen/Property.cpp
@@ -33,16 +33,23 @@ static StringRef getValueAsString(const Init *init) {
}
Property::Property(const Record *def)
- : Property(getValueAsString(def->getValueInit("storageType")),
- getValueAsString(def->getValueInit("interfaceType")),
- getValueAsString(def->getValueInit("convertFromStorage")),
- getValueAsString(def->getValueInit("assignToStorage")),
- getValueAsString(def->getValueInit("convertToAttribute")),
- getValueAsString(def->getValueInit("convertFromAttribute")),
- getValueAsString(def->getValueInit("readFromMlirBytecode")),
- getValueAsString(def->getValueInit("writeToMlirBytecode")),
- getValueAsString(def->getValueInit("hashProperty")),
- getValueAsString(def->getValueInit("defaultValue"))) {
+ : Property(
+ getValueAsString(def->getValueInit("summary")),
+ getValueAsString(def->getValueInit("description")),
+ getValueAsString(def->getValueInit("storageType")),
+ getValueAsString(def->getValueInit("interfaceType")),
+ getValueAsString(def->getValueInit("convertFromStorage")),
+ getValueAsString(def->getValueInit("assignToStorage")),
+ getValueAsString(def->getValueInit("convertToAttribute")),
+ getValueAsString(def->getValueInit("convertFromAttribute")),
+ getValueAsString(def->getValueInit("parser")),
+ getValueAsString(def->getValueInit("optionalParser")),
+ getValueAsString(def->getValueInit("printer")),
+ getValueAsString(def->getValueInit("readFromMlirBytecode")),
+ getValueAsString(def->getValueInit("writeToMlirBytecode")),
+ getValueAsString(def->getValueInit("hashProperty")),
+ getValueAsString(def->getValueInit("defaultValue")),
+ getValueAsString(def->getValueInit("storageTypeValueOverride"))) {
this->def = def;
assert((def->isSubClassOf("Property") || def->isSubClassOf("Attr")) &&
"must be subclass of TableGen 'Property' class");
@@ -50,22 +57,44 @@ Property::Property(const Record *def)
Property::Property(const DefInit *init) : Property(init->getDef()) {}
-Property::Property(StringRef storageType, StringRef interfaceType,
+Property::Property(StringRef summary, StringRef description,
+ StringRef storageType, StringRef interfaceType,
StringRef convertFromStorageCall,
StringRef assignToStorageCall,
StringRef convertToAttributeCall,
- StringRef convertFromAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall,
- StringRef hashPropertyCall, StringRef defaultValue)
- : storageType(storageType), interfaceType(interfaceType),
+ StringRef hashPropertyCall, StringRef defaultValue,
+ StringRef storageTypeValueOverride)
+ : summary(summary), description(description), storageType(storageType),
+ interfaceType(interfaceType),
convertFromStorageCall(convertFromStorageCall),
assignToStorageCall(assignToStorageCall),
convertToAttributeCall(convertToAttributeCall),
convertFromAttributeCall(convertFromAttributeCall),
+ parserCall(parserCall), optionalParserCall(optionalParserCall),
+ printerCall(printerCall),
readFromMlirBytecodeCall(readFromMlirBytecodeCall),
writeToMlirBytecodeCall(writeToMlirBytecodeCall),
- hashPropertyCall(hashPropertyCall), defaultValue(defaultValue) {
+ hashPropertyCall(hashPropertyCall), defaultValue(defaultValue),
+ storageTypeValueOverride(storageTypeValueOverride) {
if (storageType.empty())
storageType = "Property";
}
+
+StringRef Property::getPropertyDefName() const {
+ if (def->isAnonymous()) {
+ return getBaseProperty().def->getName();
+ }
+ return def->getName();
+}
+
+Property Property::getBaseProperty() const {
+ if (const auto *defInit =
+ llvm::dyn_cast<llvm::DefInit>(def->getValueInit("baseProperty"))) {
+ return Property(defInit).getBaseProperty();
+ }
+ return *this;
+}
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
index 458906a..adc02ad 100644
--- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -646,3 +646,29 @@ func.func @negative_transpose_store_scalable_via_za__bad_source_shape(%vec: vect
vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[7]x2xf32>, memref<?x?xf32>
return
}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose(
+// CHECK-SAME: %[[VEC:.*]]: vector<1x1x4x[4]xf32>)
+func.func @swap_shape_cast_of_transpose(%vector: vector<1x1x4x[4]xf32>) -> vector<[4]x4xf32> {
+ // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+ // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+ // CHECK: return %[[TRANSPOSE]]
+ %0 = vector.transpose %vector, [3, 0, 1, 2] : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+ %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+ return %1 : vector<[4]x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose_units_dims_before_and_after(
+// CHECK-SAME: %[[VEC:.*]]: vector<1x1x1x4x[4]x1xf32>)
+func.func @swap_shape_cast_of_transpose_units_dims_before_and_after(%vector: vector<1x1x1x4x[4]x1xf32>) -> vector<[4]x4xf32> {
+ // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x1x4x[4]x1xf32> to vector<4x[4]xf32>
+ // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+ // CHECK: return %[[TRANSPOSE]]
+ %0 = vector.transpose %vector, [4, 1, 0, 2, 3, 5] : vector<1x1x1x4x[4]x1xf32> to vector<[4]x1x1x1x4x1xf32>
+ %1 = vector.shape_cast %0 : vector<[4]x1x1x1x4x1xf32> to vector<[4]x4xf32>
+ return %1 : vector<[4]x4xf32>
+}
diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir
index 01ea856..418b81d 100644
--- a/mlir/test/IR/properties.mlir
+++ b/mlir/test/IR/properties.mlir
@@ -2,10 +2,10 @@
// # RUN: mlir-opt %s -mlir-print-op-generic -split-input-file | mlir-opt -mlir-print-op-generic | FileCheck %s --check-prefix=GENERIC
// CHECK: test.with_properties
-// CHECK-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>{{$}}
+// CHECK-SAME: a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]{{$}}
// GENERIC: "test.with_properties"()
-// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}> : () -> ()
-test.with_properties <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo", c = "bar", flag = true}> : () -> ()
+test.with_properties a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
// CHECK: test.with_nice_properties
// CHECK-SAME: "foo bar" is -3{{$}}
@@ -34,18 +34,48 @@ test.using_property_in_custom [1, 4, 20]
// GENERIC-SAME: }>
test.using_property_ref_in_custom 1 + 4 = 5
-// CHECK: test.with_default_valued_properties {{$}}
+// CHECK: test.with_default_valued_properties na{{$}}
// GENERIC: "test.with_default_valued_properties"()
-// GENERIC-SAME: <{a = 0 : i32}>
-test.with_default_valued_properties <{a = 0 : i32}>
+// GENERIC-SAME: <{a = 0 : i32, b = "", c = -1 : i32, unit = false}> : () -> ()
+test.with_default_valued_properties 0 "" -1 unit_absent
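+// Note: every value above is the corresponding property's default, so the
+// custom printer elides them all; only the `na` alternative for `a` prints.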
+
+// CHECK: test.with_default_valued_properties 1 "foo" 0 unit{{$}}
+// GENERIC: "test.with_default_valued_properties"()
+// GENERIC-SAME: <{a = 1 : i32, b = "foo", c = 0 : i32, unit}> : () -> ()
+test.with_default_valued_properties 1 "foo" 0 unit
// CHECK: test.with_optional_properties
-// CHECK-SAME: <{b = 0 : i32}>
+// CHECK-SAME: simple = 0
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0]}> : () -> ()
+test.with_optional_properties simple = 0
+
+// CHECK: test.with_optional_properties{{$}}
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: <{b = 0 : i32}>
-test.with_optional_properties <{b = 0 : i32}>
+// GENERIC-SAME: simple = []
+test.with_optional_properties
-// CHECK: test.with_optional_properties {{$}}
+// CHECK: test.with_optional_properties
+// CHECK-SAME: anAttr = 0 simple = 1 nonTrivialStorage = "foo" hasDefault = some<0> nested = some<1> longSyntax = some<"bar"> hasUnit maybeUnit = some<unit>
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: : () -> ()
+// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1]}> : () -> ()
test.with_optional_properties
+ anAttr = 0
+ simple = 1
+ nonTrivialStorage = "foo"
+ hasDefault = some<0>
+ nested = some<1>
+ longSyntax = some<"bar">
+ hasUnit
+ maybeUnit = some<unit>
+
+// CHECK: test.with_optional_properties
+// CHECK-SAME: nested = some<none>
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: nested = {{\[}}[]]
+test.with_optional_properties nested = some<none>
+
+// CHECK: test.with_array_properties
+// CHECK-SAME: ints = [1, 2] strings = ["a", "b"] nested = {{\[}}[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent]
+// GENERIC: "test.with_array_properties"()
+test.with_array_properties ints = [1, 2] strings = ["a", "b"] nested = [[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent] [] thats_has_default
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir
index 1e04670..49cfd7e 100644
--- a/mlir/test/IR/traits.mlir
+++ b/mlir/test/IR/traits.mlir
@@ -502,6 +502,25 @@ func.func @succeededOilistTrivial() {
// -----
+// CHECK-LABEL: @succeededOilistTrivialProperties
+func.func @succeededOilistTrivialProperties() {
+ // CHECK: test.oilist_with_keywords_only_properties keyword
+ test.oilist_with_keywords_only_properties keyword
+ // CHECK: test.oilist_with_keywords_only_properties otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties keyword otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword keyword
+ // CHECK: test.oilist_with_keywords_only_properties thirdKeyword
+ test.oilist_with_keywords_only_properties thirdKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword thirdKeyword
+ test.oilist_with_keywords_only_properties keyword thirdKeyword
+ return
+}
+
+// -----
+
// CHECK-LABEL: @succeededOilistSimple
func.func @succeededOilistSimple(%arg0 : i32, %arg1 : i32, %arg2 : i32) {
// CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 7a7af2b..a789ab9 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -408,10 +408,10 @@ func.func @test_move_op_before_rollback() {
// CHECK-LABEL: func @test_properties_rollback()
func.func @test_properties_rollback() {
- // CHECK: test.with_properties <{a = 32 : i64,
+ // CHECK: test.with_properties a = 32,
// expected-remark @below{{op 'test.with_properties' is not legalizable}}
test.with_properties
- <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+ a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
{modify_inplace}
"test.return"() : () -> ()
}
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
index 6e75dd3..9ed1b3a 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
@@ -297,11 +297,17 @@ void test::printSwitchCases(OpAsmPrinter &p, Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool test::parseUsingPropertyInCustom(OpAsmParser &parser, int64_t value[3]) {
- return parser.parseLSquare() || parser.parseInteger(value[0]) ||
- parser.parseComma() || parser.parseInteger(value[1]) ||
- parser.parseComma() || parser.parseInteger(value[2]) ||
- parser.parseRSquare();
+bool test::parseUsingPropertyInCustom(OpAsmParser &parser,
+ SmallVector<int64_t> &value) {
+ auto elemParser = [&]() {
+ int64_t v = 0;
+ if (failed(parser.parseInteger(v)))
+ return failure();
+ value.push_back(v);
+ return success();
+ };
+ return failed(parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Square,
+ elemParser));
}
void test::printUsingPropertyInCustom(OpAsmPrinter &printer, Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.h b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
index 7e9cd83..6d4df7d 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.h
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
@@ -160,7 +160,8 @@ void printSwitchCases(mlir::OpAsmPrinter &p, mlir::Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser, int64_t value[3]);
+bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser,
+ llvm::SmallVector<int64_t> &value);
void printUsingPropertyInCustom(mlir::OpAsmPrinter &printer,
mlir::Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 9450764..2d97a02 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2947,11 +2947,18 @@ def TestVersionedOpC : TEST_Op<"versionedC"> {
// Op with a properties struct defined inline.
def TestOpWithProperties : TEST_Op<"with_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
+ let assemblyFormat = [{
+ `a` `=` $a `,`
+ `b` `=` $b `,`
+ `c` `=` $c `,`
+ `flag` `=` $flag `,`
+ `array` `=` $array attr-dict}];
let arguments = (ins
- IntProperty<"int64_t">:$a,
+ I64Property:$a,
StrAttr:$b, // Attributes can directly be used here.
- ArrayProperty<"int64_t", 4>:$array // example of an array
+ StringProperty:$c,
+ BoolProperty:$flag,
+ IntArrayProperty<"int64_t">:$array // example of an array
);
}
@@ -2974,7 +2981,7 @@ def TestOpWithPropertiesAndInferredType
// Demonstrate how to wrap an existing C++ class named MyPropStruct.
def MyStructProperty : Property<"MyPropStruct"> {
- let convertToAttribute = "$_storage.asAttribute($_ctxt)";
+ let convertToAttribute = "return $_storage.asAttribute($_ctxt);";
let convertFromAttribute = "return MyPropStruct::setFromAttr($_storage, $_attr, $_diag);";
let hashProperty = "$_storage.hash();";
}
@@ -2988,14 +2995,14 @@ def TestOpWithWrappedProperties : TEST_Op<"with_wrapped_properties"> {
def TestOpUsingPropertyInCustom : TEST_Op<"using_property_in_custom"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) attr-dict";
- let arguments = (ins ArrayProperty<"int64_t", 3>:$prop);
+ let arguments = (ins IntArrayProperty<"int64_t">:$prop);
}
def TestOpUsingPropertyInCustomAndOther
: TEST_Op<"using_property_in_custom_and_other"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) prop-dict attr-dict";
let arguments = (ins
- ArrayProperty<"int64_t", 3>:$prop,
+ IntArrayProperty<"int64_t">:$prop,
IntProperty<"int64_t">:$other
);
}
@@ -3021,7 +3028,7 @@ def TestOpUsingIntPropertyWithWorseBytecode
def PropertiesWithCustomPrint : Property<"PropertiesWithCustomPrint"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3085,7 +3092,7 @@ def TestOpWithNiceProperties : TEST_Op<"with_nice_properties"> {
def VersionedProperties : Property<"VersionedProperties"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3131,13 +3138,65 @@ def TestOpWithVersionedProperties : TEST_Op<"with_versioned_properties"> {
}
def TestOpWithDefaultValuedProperties : TEST_Op<"with_default_valued_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a);
+ let assemblyFormat = [{
+ ($a^) : (`na`)?
+ ($b^)?
+ ($c^)?
+ ($unit^)?
+ attr-dict
+ }];
+ let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a,
+ DefaultValuedProperty<StringProperty, "\"\"">:$b,
+ DefaultValuedProperty<IntProperty<"int32_t">, "-1">:$c,
+ UnitProperty:$unit);
}
def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins OptionalAttr<I32Attr>:$a, OptionalAttr<I32Attr>:$b);
+ let assemblyFormat = [{
+ (`anAttr` `=` $anAttr^)?
+ (`simple` `=` $simple^)?
+ (`nonTrivialStorage` `=` $nonTrivialStorage^)?
+ (`hasDefault` `=` $hasDefault^)?
+ (`nested` `=` $nested^)?
+ (`longSyntax` `=` $longSyntax^)?
+ (`hasUnit` $hasUnit^)?
+ (`maybeUnit` `=` $maybeUnit^)?
+ attr-dict
+ }];
+ let arguments = (ins
+ OptionalAttr<I32Attr>:$anAttr,
+ OptionalProperty<I64Property>:$simple,
+ OptionalProperty<StringProperty>:$nonTrivialStorage,
+    // Confirm that default-valued properties wrapped in OptionalProperty
+    // default to nullopt and use the long syntax.
+ OptionalProperty<DefaultValuedProperty<I64Property, "0">>:$hasDefault,
+ OptionalProperty<OptionalProperty<I64Property>>:$nested,
+ OptionalProperty<StringProperty, 0>:$longSyntax,
+ UnitProperty:$hasUnit,
+ OptionalProperty<UnitProperty>:$maybeUnit);
+}
+
+def TestOpWithArrayProperties : TEST_Op<"with_array_properties"> {
+ let assemblyFormat = [{
+ `ints` `=` $ints
+ `strings` `=` $strings
+ `nested` `=` $nested
+ `opt` `=` $opt
+ `explicitOptions` `=` $explicitOptions
+ `explicitUnits` `=` $explicitUnits
+ ($hasDefault^ `thats_has_default`)?
+ attr-dict
+ }];
+ let arguments = (ins
+ ArrayProperty<I64Property>:$ints,
+ ArrayProperty<StringProperty>:$strings,
+ ArrayProperty<ArrayProperty<I32Property>>:$nested,
+ OptionalProperty<ArrayProperty<I32Property>>:$opt,
+ ArrayProperty<OptionalProperty<I64Property>>:$explicitOptions,
+ ArrayProperty<UnitProperty>:$explicitUnits,
+ DefaultValuedProperty<ArrayProperty<I64Property>,
+ "::llvm::ArrayRef<int64_t>{}", "::llvm::SmallVector<int64_t>{}">:$hasDefault
+ );
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
index 3129085..795b9da 100644
--- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
+++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
@@ -86,6 +86,17 @@ def OIListTrivial : TEST_Op<"oilist_with_keywords_only"> {
}];
}
+// Ops related to OIList primitive
+def OIListTrivialProperties : TEST_Op<"oilist_with_keywords_only_properties"> {
+ let arguments = (ins UnitProperty:$keyword, UnitProperty:$otherKeyword,
+ UnitProperty:$diffNameUnitPropertyKeyword);
+ let assemblyFormat = [{
+ oilist( `keyword` $keyword
+ | `otherKeyword` $otherKeyword
+ | `thirdKeyword` $diffNameUnitPropertyKeyword) attr-dict
+ }];
+}
+
def OIListSimple : TEST_Op<"oilist_with_simple_args", [AttrSizedOperandSegments]> {
let arguments = (ins Optional<AnyType>:$arg0,
Optional<AnyType>:$arg1,
@@ -392,6 +403,17 @@ def FormatOptionalUnitAttrNoElide
let assemblyFormat = "($is_optional^)? attr-dict";
}
+def FormatOptionalUnitProperty : TEST_Op<"format_optional_unit_property"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "(`is_optional` $is_optional^)? attr-dict";
+}
+
+def FormatOptionalUnitPropertyNoElide
+ : TEST_Op<"format_optional_unit_property_no_elide"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "($is_optional^)? attr-dict";
+}
+
def FormatOptionalEnumAttr : TEST_Op<"format_optional_enum_attr"> {
let arguments = (ins OptionalAttr<SomeI64Enum>:$attr);
let assemblyFormat = "($attr^)? attr-dict";
diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir
index 46d27264..03288ae 100644
--- a/mlir/test/mlir-tblgen/op-format.mlir
+++ b/mlir/test/mlir-tblgen/op-format.mlir
@@ -195,6 +195,16 @@ test.format_optional_unit_attribute
// CHECK: test.format_optional_unit_attribute_no_elide unit
test.format_optional_unit_attribute_no_elide unit
+// CHECK: test.format_optional_unit_property is_optional
+test.format_optional_unit_property is_optional
+
+// CHECK: test.format_optional_unit_property
+// CHECK-NOT: is_optional
+test.format_optional_unit_property
+
+// CHECK: test.format_optional_unit_property_no_elide unit
+test.format_optional_unit_property_no_elide unit
+
// CHECK: test.format_optional_enum_attr case5
test.format_optional_enum_attr case5
diff --git a/mlir/test/mlir-tblgen/op-format.td b/mlir/test/mlir-tblgen/op-format.td
index 4a19ffb..8af4341 100644
--- a/mlir/test/mlir-tblgen/op-format.td
+++ b/mlir/test/mlir-tblgen/op-format.td
@@ -73,7 +73,7 @@ def OptionalGroupA : TestFormat_Op<[{
// CHECK-NEXT: result.addAttribute("a", parser.getBuilder().getUnitAttr())
// CHECK: parser.parseKeyword("bar")
// CHECK-LABEL: OptionalGroupB::print
-// CHECK: if (!getAAttr())
+// CHECK: if (!(getAAttr() && getAAttr() != ((false) ? ::mlir::OpBuilder((*this)->getContext()).getUnitAttr() : nullptr)))
// CHECK-NEXT: odsPrinter << ' ' << "foo"
// CHECK-NEXT: else
// CHECK-NEXT: odsPrinter << ' ' << "bar"
@@ -84,7 +84,7 @@ def OptionalGroupB : TestFormat_Op<[{
// Optional group anchored on a default-valued attribute:
// CHECK-LABEL: OptionalGroupC::parse
-// CHECK: if (getAAttr() && getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
+// CHECK: if (getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
// CHECK-NEXT: odsPrinter << ' ';
// CHECK-NEXT: odsPrinter.printAttributeWithoutType(getAAttr());
// CHECK-NEXT: }
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
index 7b0ee6b..918583c 100644
--- a/mlir/test/mlir-tblgen/op-properties.td
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -1,8 +1,10 @@
-// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s
+// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-op-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEFS
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
def Test_Dialect : Dialect {
let name = "test";
@@ -15,7 +17,115 @@ def OpWithAttr : NS_Op<"op_with_attr">{
let arguments = (ins AnyAttr:$attr, OptionalAttr<AnyAttr>:$optional);
}
-// CHECK: void setAttrAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().attr = attr
-// CHECK: void setOptionalAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().optional = attr
+// Test required and optional properties
+// ---
+
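+// An array property whose interface-type default is an empty ArrayRef but
+// whose storage must be initialized with a SmallVector, exercising
+// storageTypeValueOverride.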
+def DefaultI64Array : IntArrayProperty<"int64_t"> {
+ let defaultValue = "::llvm::ArrayRef<int64_t>{}";
+ let storageTypeValueOverride = "::llvm::SmallVector<int64_t>{}";
+}
+
+def OpWithProps : NS_Op<"op_with_props"> {
+ let arguments = (ins
+ BoolProperty:$flag,
+ StringProperty:$string,
+ ArrayProperty<StringProperty>:$strings,
+ DefaultValuedProperty<I32Property, "0">:$default_int,
+ OptionalProperty<I64Property>:$optional,
+ DefaultI64Array:$intArray
+ );
+}
+
+/// Check that optional arguments to builders only go at the end.
+def OpWithSomeOptionalProperties : NS_Op<"op_with_some_optional_props"> {
+ let arguments = (ins
+ OptionalProperty<I64Property>:$mustSpecify,
+ I64Property:$required,
+ OptionalProperty<StringProperty>:$canOmit,
+ DefaultValuedProperty<I64Property, "-1">:$canOmit2
+ );
+}
+
+/// Check that the protection against ambiguous attributes correctly stops
+/// optional properties from getting default argument values in builders.
+def OpWithOptionalPropsAndAttrs :
+ NS_Op<"with_some_optional_props_and_atts"> {
+ let arguments = (ins
+ OptionalProperty<BoolProperty>:$mustSpecify,
+ OptionalAttr<BoolAttr>:$ambiguous,
+ OptionalAttr<I32Attr>:$canOmit,
+ OptionalProperty<I32Property>:$canOmitProp
+ );
+}
+
+// DECL: void setAttrAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().attr = attr
+// DECL: void setOptionalAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().optional = attr
+
+// -----
+
+// DECL-LABEL: class OpWithOptionalPropsAndAttrs :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<bool> mustSpecify,
+// DECL-SAME: /*optional*/::mlir::BoolAttr ambiguous,
+// DECL-SAME: /*optional*/::mlir::IntegerAttr canOmit,
+// DECL-SAME: /*optional*/std::optional<int32_t> canOmitProp = std::nullopt);
+
+// -----
+
+// COM: Ensure the struct is set up how we expect
+// DECL-LABEL: class OpWithPropsGenericAdaptorBase
+// DECL: using flagTy = bool;
+// DECL-NEXT: flagTy flag;
+// DECL-NEXT: bool getFlag()
+// DECL-NEXT: propStorage = this->flag
+// DECL-NEXT: return propStorage;
+// DECL: void setFlag(bool propValue)
+// DECL-NEXT: propStorage = this->flag;
+// DECL-NEXT: propStorage = propValue;
+// DECL: using stringTy = std::string;
+// DECL: llvm::StringRef getString()
+// DECL: auto &propStorage = this->string;
+// DECL-NEXT: return ::llvm::StringRef{propStorage};
+// DECL: using stringsTy = ::llvm::SmallVector<std::string>
+// DECL: ::llvm::ArrayRef<std::string> getStrings()
+// DECL: using default_intTy = int32_t;
+// DECL: default_intTy default_int = 0;
+// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{};
+// DECL: ::llvm::ArrayRef<int64_t> getIntArray()
+// DECL: return ::llvm::ArrayRef<int64_t>{propStorage}
+// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue)
+// DECL: propStorage.assign
+// DECL-LABEL: class OpWithProps :
+// DECL: setString(::llvm::StringRef newString)
+// DECL-NEXT: getProperties().setString(newString)
+
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: bool flag,
+// DECL-SAME: ::llvm::StringRef string,
+// DECL-SAME: ::llvm::ArrayRef<std::string> strings,
+// DECL-SAME: /*optional*/int32_t default_int = 0,
+// DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt,
+// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{});
+
+// DEFS-LABEL: OpWithProps::computePropertiesHash
+// DEFS: hash_intArray
+// DEFS-NEXT: return ::llvm::hash_value(::llvm::ArrayRef<int64_t>{propStorage})
+// DEFS: ::llvm::hash_value(prop.optional)
+// DEFS: hash_intArray(prop.intArray)
+
+// -----
+
+// DECL-LABEL: class OpWithSomeOptionalProperties :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<int64_t> mustSpecify,
+// DECL-SAME: int64_t required,
+// DECL-SAME: /*optional*/std::optional<::llvm::StringRef> canOmit = std::nullopt,
+// DECL-SAME: /*optional*/int64_t canOmit2 = -1);
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0fc750c..a2ceefb 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -155,6 +155,36 @@ static const char *const valueRangeReturnCode = R"(
std::next({0}, valueRange.first + valueRange.second)};
)";
+/// Parse operand/result segment_size property.
+/// {0}: Number of elements in the segment array
+static const char *const parseTextualSegmentSizeFormat = R"(
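+  // Parse exactly {0} comma-separated integers enclosed in square brackets.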
+ size_t i = 0;
+ auto parseElem = [&]() -> ::mlir::ParseResult {
+ if (i >= {0})
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected `]` after {0} segment sizes");
+ if (failed($_parser.parseInteger($_storage[i])))
+ return ::mlir::failure();
+ i += 1;
+ return ::mlir::success();
+ };
+  if (failed($_parser.parseCommaSeparatedList(
+      ::mlir::AsmParser::Delimiter::Square, parseElem)))
+    return ::mlir::failure();
+  if (i < {0})
+    return $_parser.emitError($_parser.getCurrentLocation(),
+      "expected {0} segment sizes, found only ") << i;
+  return ::mlir::success();
+)";
+
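+/// Print an operand/result segment_size property as `[a, b, c]`, matching the
+/// textual form accepted by the parser above.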
+static const char *const printTextualSegmentSize = R"(
+ [&]() {
+ $_printer << '[';
+ ::llvm::interleaveComma($_storage, $_printer);
+ $_printer << ']';
+ }()
+)";
+
/// Read operand/result segment_size from bytecode.
static const char *const readBytecodeSegmentSizeNative = R"(
if ($_reader.getBytecodeVersion() >= /*kNativePropertiesODSSegmentSize=*/6)
@@ -422,8 +452,10 @@ private:
// Property
std::optional<NamedProperty> operandSegmentsSize;
std::string operandSegmentsSizeStorage;
+ std::string operandSegmentsSizeParser;
std::optional<NamedProperty> resultSegmentsSize;
std::string resultSegmentsSizeStorage;
+ std::string resultSegmentsSizeParser;
// Indices to store the position in the emission order of the operand/result
// segment sizes attribute if emitted as part of the properties for legacy
@@ -448,31 +480,40 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
{namedAttr.name, AttributeMetadata{namedAttr.name, !isOptional, attr}});
}
- auto makeProperty = [&](StringRef storageType) {
+ auto makeProperty = [&](StringRef storageType, StringRef parserCall) {
return Property(
+ /*summary=*/"",
+ /*description=*/"",
/*storageType=*/storageType,
/*interfaceType=*/"::llvm::ArrayRef<int32_t>",
/*convertFromStorageCall=*/"$_storage",
/*assignToStorageCall=*/
"::llvm::copy($_value, $_storage.begin())",
/*convertToAttributeCall=*/
- "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)",
+ "return ::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage);",
/*convertFromAttributeCall=*/
"return convertFromAttribute($_storage, $_attr, $_diag);",
+ /*parserCall=*/parserCall,
+ /*optionalParserCall=*/"",
+ /*printerCall=*/printTextualSegmentSize,
/*readFromMlirBytecodeCall=*/readBytecodeSegmentSizeNative,
/*writeToMlirBytecodeCall=*/writeBytecodeSegmentSizeNative,
/*hashPropertyCall=*/
"::llvm::hash_combine_range(std::begin($_storage), "
"std::end($_storage));",
- /*StringRef defaultValue=*/"");
+ /*StringRef defaultValue=*/"",
+ /*storageTypeValueOverride=*/"");
};
// Include key attributes from several traits as implicitly registered.
if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
if (op.getDialect().usePropertiesForAttributes()) {
operandSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumOperands());
- operandSegmentsSize = {"operandSegmentSizes",
- makeProperty(operandSegmentsSizeStorage)};
+ operandSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumOperands());
+ operandSegmentsSize = {
+ "operandSegmentSizes",
+ makeProperty(operandSegmentsSizeStorage, operandSegmentsSizeParser)};
} else {
attrMetadata.insert(
{operandSegmentAttrName, AttributeMetadata{operandSegmentAttrName,
@@ -484,8 +525,11 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
if (op.getDialect().usePropertiesForAttributes()) {
resultSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumResults());
- resultSegmentsSize = {"resultSegmentSizes",
- makeProperty(resultSegmentsSizeStorage)};
+ resultSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumResults());
+ resultSegmentsSize = {
+ "resultSegmentSizes",
+ makeProperty(resultSegmentsSizeStorage, resultSegmentsSizeParser)};
} else {
attrMetadata.insert(
{resultSegmentAttrName,
@@ -572,6 +616,12 @@ private:
void
genPropertiesSupportForBytecode(ArrayRef<ConstArgument> attrOrProperties);
+ // Generates getters for the properties.
+ void genPropGetters();
+
+  // Generates setters for the properties.
+ void genPropSetters();
+
// Generates getters for the attributes.
void genAttrGetters();
@@ -1041,6 +1091,8 @@ OpEmitter::OpEmitter(const Operator &op,
genNamedRegionGetters();
genNamedSuccessorGetters();
genPropertiesSupport();
+ genPropGetters();
+ genPropSetters();
genAttrGetters();
genAttrSetters();
genOptionalAttrRemovers();
@@ -1198,6 +1250,16 @@ void OpEmitter::genAttrNameGetters() {
}
}
+// Emit the getter for a named property.
+// It is templated to be shared between the Op and the adaptor class.
+template <typename OpClassOrAdaptor>
+static void emitPropGetter(OpClassOrAdaptor &opClass, const Operator &op,
+ StringRef name, const Property &prop) {
+ auto *method = opClass.addInlineMethod(prop.getInterfaceType(), name);
+ ERROR_IF_PRUNED(method, name, op);
+ method->body() << formatv(" return getProperties().{0}();", name);
+}
+
// Emit the getter for an attribute with the return type specified.
// It is templated to be shared between the Op and the adaptor class.
template <typename OpClassOrAdaptor>
@@ -1313,7 +1375,7 @@ void OpEmitter::genPropertiesSupport() {
)decl";
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0}
};
{2};
@@ -1358,7 +1420,10 @@ void OpEmitter::genPropertiesSupport() {
.addSubst("_storage", propertyStorage)
.addSubst("_diag", propertyDiag)),
name, getAttr);
- if (prop.hasDefaultValue()) {
+ if (prop.hasStorageTypeValueOverride()) {
+ setPropMethod << formatv(attrGetDefaultFmt, name,
+ prop.getStorageTypeValueOverride());
+ } else if (prop.hasDefaultValue()) {
setPropMethod << formatv(attrGetDefaultFmt, name,
prop.getDefaultValue());
} else {
@@ -1409,8 +1474,10 @@ void OpEmitter::genPropertiesSupport() {
const char *propToAttrFmt = R"decl(
{
const auto &propStorage = prop.{0};
- attrs.push_back(odsBuilder.getNamedAttr("{0}",
- {1}));
+ auto attr = [&]() -> ::mlir::Attribute {{
+ {1}
+ }();
+ attrs.push_back(odsBuilder.getNamedAttr("{0}", attr));
}
)decl";
for (const auto &attrOrProp : attrOrProperties) {
@@ -1458,9 +1525,12 @@ void OpEmitter::genPropertiesSupport() {
StringRef name = namedProperty->name;
auto &prop = namedProperty->prop;
FmtContext fctx;
- hashMethod << formatv(propHashFmt, name,
- tgfmt(prop.getHashPropertyCall(),
- &fctx.addSubst("_storage", propertyStorage)));
+ if (!prop.getHashPropertyCall().empty()) {
+ hashMethod << formatv(
+ propHashFmt, name,
+ tgfmt(prop.getHashPropertyCall(),
+ &fctx.addSubst("_storage", propertyStorage)));
+ }
}
}
hashMethod << " return llvm::hash_combine(";
@@ -1468,8 +1538,13 @@ void OpEmitter::genPropertiesSupport() {
attrOrProperties, hashMethod, [&](const ConstArgument &attrOrProp) {
if (const auto *namedProperty =
llvm::dyn_cast_if_present<const NamedProperty *>(attrOrProp)) {
- hashMethod << "\n hash_" << namedProperty->name << "(prop."
- << namedProperty->name << ")";
+ if (!namedProperty->prop.getHashPropertyCall().empty()) {
+ hashMethod << "\n hash_" << namedProperty->name << "(prop."
+ << namedProperty->name << ")";
+ } else {
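+            // Properties without a custom hash function fall back to
+            // llvm::hash_value on their storage.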
+ hashMethod << "\n ::llvm::hash_value(prop."
+ << namedProperty->name << ")";
+ }
return;
}
const auto *namedAttr =
@@ -1524,8 +1599,9 @@ void OpEmitter::genPropertiesSupport() {
"\"{0}\") return ",
resultSegmentAttrName);
}
- getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx)
- << ";\n";
+ getInherentAttrMethod << "[&]() -> ::mlir::Attribute { "
+ << tgfmt(prop.getConvertToAttributeCall(), &fctx)
+ << " }();\n";
if (name == operandSegmentAttrName) {
setInherentAttrMethod
@@ -1549,13 +1625,15 @@ void OpEmitter::genPropertiesSupport() {
)decl",
name);
if (name == operandSegmentAttrName) {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", operandSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ operandSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
} else {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", resultSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ resultSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
}
}
getInherentAttrMethod << " return std::nullopt;\n";
@@ -1701,6 +1779,26 @@ void OpEmitter::genPropertiesSupportForBytecode(
readPropertiesMethod << " return ::mlir::success();";
}
+void OpEmitter::genPropGetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getGetterName(prop.name);
+ emitPropGetter(opClass, op, name, prop.prop);
+ }
+}
+
+void OpEmitter::genPropSetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getSetterName(prop.name);
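+    // Name the setter argument after the property, e.g. `value` -> `newValue`.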
+ std::string argName = "new" + convertToCamelFromSnakeCase(
+ prop.name, /*capitalizeFirst=*/true);
+ auto *method = opClass.addInlineMethod(
+ "void", name, MethodParameter(prop.prop.getInterfaceType(), argName));
+ if (!method)
+ return;
+ method->body() << formatv(" getProperties().{0}({1});", name, argName);
+ }
+}
+
void OpEmitter::genAttrGetters() {
FmtContext fctx;
fctx.withBuilder("::mlir::Builder((*this)->getContext())");
@@ -2957,6 +3055,12 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
}
// Add parameters for all arguments (operands and attributes).
+ // Track "attr-like" (property and attribute) optional values separate from
+ // attributes themselves so that the disambiguation code can look at the first
+ // attribute specifically when determining where to trim the optional-value
+ // list to avoid ambiguity while preserving the ability of all-property ops to
+ // use default parameters.
+ int defaultValuedAttrLikeStartIndex = op.getNumArgs();
int defaultValuedAttrStartIndex = op.getNumArgs();
// Successors and variadic regions go at the end of the parameter list, so no
// default arguments are possible.
@@ -2967,6 +3071,15 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
for (int i = op.getNumArgs() - 1; i >= 0; --i) {
auto *namedAttr =
llvm::dyn_cast_if_present<tblgen::NamedAttribute *>(op.getArg(i));
+ auto *namedProperty =
+ llvm::dyn_cast_if_present<tblgen::NamedProperty *>(op.getArg(i));
+ if (namedProperty) {
+ Property prop = namedProperty->prop;
+ if (!prop.hasDefaultValue())
+ break;
+ defaultValuedAttrLikeStartIndex = i;
+ continue;
+ }
if (!namedAttr)
break;
@@ -2986,6 +3099,7 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if (retType == "::llvm::APInt" || retType == "::llvm::APFloat")
break;
+ defaultValuedAttrLikeStartIndex = i;
defaultValuedAttrStartIndex = i;
}
}
@@ -3001,8 +3115,10 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if ((attrParamKind == AttrParamKind::WrappedAttr &&
canUseUnwrappedRawValue(attr)) ||
(attrParamKind == AttrParamKind::UnwrappedValue &&
- !canUseUnwrappedRawValue(attr)))
+ !canUseUnwrappedRawValue(attr))) {
++defaultValuedAttrStartIndex;
+ defaultValuedAttrLikeStartIndex = defaultValuedAttrStartIndex;
+ }
}
/// Collect any inferred attributes.
@@ -3029,8 +3145,16 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
operand->isOptional());
continue;
}
- if (llvm::isa_and_present<NamedProperty *>(arg)) {
- // TODO
+ if (auto *propArg = llvm::dyn_cast_if_present<NamedProperty *>(arg)) {
+ const Property &prop = propArg->prop;
+ StringRef type = prop.getInterfaceType();
+ std::string defaultValue;
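+      // Attach a default argument value only when every later argument can
+      // also be defaulted; otherwise the default would be unusable.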
+ if (prop.hasDefaultValue() && i >= defaultValuedAttrLikeStartIndex) {
+ defaultValue = prop.getDefaultValue();
+ }
+ bool isOptional = prop.hasDefaultValue();
+ paramList.emplace_back(type, propArg->name, StringRef(defaultValue),
+ isOptional);
continue;
}
const NamedAttribute &namedAttr = *arg.get<NamedAttribute *>();
@@ -3157,6 +3281,15 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
}
}
+ // Push all properties to the result.
+ for (const auto &namedProp : op.getProperties()) {
+ // Use the setter from the Properties struct since the conversion from the
+ // interface type (used in the builder argument) to the storage type (used
+ // in the state) is not necessarily trivial.
+ std::string setterName = op.getSetterName(namedProp.name);
+ body << formatv(" {0}.getOrAddProperties<Properties>().{1}({2});\n",
+ builderOpState, setterName, namedProp.name);
+ }
// Push all attributes to the result.
for (const auto &namedAttr : op.getAttributes()) {
auto &attr = namedAttr.attr;
@@ -3996,17 +4129,19 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
// Generate the data member using the storage type.
os << " using " << name << "Ty = " << prop.getStorageType() << ";\n"
<< " " << name << "Ty " << name;
- if (prop.hasDefaultValue())
+ if (prop.hasStorageTypeValueOverride())
+ os << " = " << prop.getStorageTypeValueOverride();
+ else if (prop.hasDefaultValue())
os << " = " << prop.getDefaultValue();
comparatorOs << " rhs." << name << " == this->" << name
<< " &&\n";
// Emit accessors using the interface type.
const char *accessorFmt = R"decl(;
- {0} get{1}() {
+ {0} get{1}() const {
auto &propStorage = this->{2};
return {3};
}
- void set{1}(const {0} &propValue) {
+ void set{1}({0} propValue) {
auto &propStorage = this->{2};
{4};
}
@@ -4274,6 +4409,11 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
ERROR_IF_PRUNED(m, "Adaptor::getAttributes", op);
m->body() << " return odsAttrs;";
}
+ for (auto &namedProp : op.getProperties()) {
+ std::string name = op.getGetterName(namedProp.name);
+ emitPropGetter(genericAdaptorBase, op, name, namedProp.prop);
+ }
+
for (auto &namedAttr : op.getAttributes()) {
const auto &name = namedAttr.name;
const auto &attr = namedAttr.attr;
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index a97d876..27ad79a 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -45,7 +45,7 @@ public:
OpVariableElement(const VarT *var) : var(var) {}
/// Get the variable.
- const VarT *getVar() { return var; }
+ const VarT *getVar() const { return var; }
protected:
/// The op variable, e.g. a type or attribute constraint.
@@ -64,11 +64,6 @@ struct AttributeVariable
return attrType ? attrType->getBuilderCall() : std::nullopt;
}
- /// Return if this attribute refers to a UnitAttr.
- bool isUnitAttr() const {
- return var->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
- }
-
/// Indicate if this attribute is printed "qualified" (that is it is
/// prefixed with the `#dialect.mnemonic`).
bool shouldBeQualified() { return shouldBeQualifiedFlag; }
@@ -98,6 +93,42 @@ using SuccessorVariable =
/// This class represents a variable that refers to a property argument.
using PropertyVariable =
OpVariableElement<NamedProperty, VariableElement::Property>;
+
+/// LLVM RTTI helper for attribute-like variables, that is, attributes or
+/// properties. This allows for common handling of attributes and properties in
+/// parts of the code that are oblivious to whether something is stored as an
+/// attribute or a property.
+struct AttributeLikeVariable : public VariableElement {
+ enum { AttributeLike = 1 << 0 };
+
+ static bool classof(const VariableElement *ve) {
+ return ve->getKind() == VariableElement::Attribute ||
+ ve->getKind() == VariableElement::Property;
+ }
+
+ static bool classof(const FormatElement *fe) {
+ return isa<VariableElement>(fe) && classof(cast<VariableElement>(fe));
+ }
+
+ /// Returns true if the variable is a UnitAttr or a UnitProperty.
+ bool isUnit() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
+ if (const auto *prop = dyn_cast<PropertyVariable>(this)) {
+ return prop->getVar()->prop.getBaseProperty().getPropertyDefName() ==
+ "UnitProperty";
+ }
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+
+ StringRef getName() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->name;
+ if (const auto *prop = dyn_cast<PropertyVariable>(this))
+ return prop->getVar()->name;
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+};
} // namespace
//===----------------------------------------------------------------------===//
@@ -214,11 +245,11 @@ public:
/// If the parsing element is a single UnitAttr element, then it returns the
/// attribute variable. Otherwise, returns nullptr.
- AttributeVariable *
- getUnitAttrParsingElement(ArrayRef<FormatElement *> pelement) {
+ AttributeLikeVariable *
+ getUnitVariableParsingElement(ArrayRef<FormatElement *> pelement) {
if (pelement.size() == 1) {
- auto *attrElem = dyn_cast<AttributeVariable>(pelement[0]);
- if (attrElem && attrElem->isUnitAttr())
+ auto *attrElem = dyn_cast<AttributeLikeVariable>(pelement[0]);
+ if (attrElem && attrElem->isUnit())
return attrElem;
}
return nullptr;
@@ -488,6 +519,36 @@ const char *const enumAttrParserCode = R"(
}
)";
+/// The code snippet used to generate a parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+/// {3}: The description of the expected property for the error message.
+const char *const propertyParserCode = R"(
+ auto {0}PropLoc = parser.getCurrentLocation();
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::ParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if (failed({0}PropParseResult)) {{
+ return parser.emitError({0}PropLoc, "invalid value for property {0}, expected {3}");
+ }
+)";
+
+/// The code snippet used to generate an optional parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+const char *const optionalPropertyParserCode = R"(
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::OptionalParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if ({0}PropParseResult.has_value() && failed(*{0}PropParseResult)) {{
+ return ::mlir::failure();
+ }
+)";
+
/// The code snippet used to generate a parser call for an operand.
///
/// {0}: The name of the operand.
@@ -796,9 +857,9 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
// If the anchor is a unit attribute, it won't be parsed directly so elide
// it.
- auto *anchor = dyn_cast<AttributeVariable>(optional->getAnchor());
+ auto *anchor = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
FormatElement *elidedAnchorElement = nullptr;
- if (anchor && anchor != elements.front() && anchor->isUnitAttr())
+ if (anchor && anchor != elements.front() && anchor->isUnit())
elidedAnchorElement = anchor;
for (FormatElement *childElement : elements)
if (childElement != elidedAnchorElement)
@@ -808,7 +869,7 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
} else if (auto *oilist = dyn_cast<OIListElement>(element)) {
for (ArrayRef<FormatElement *> pelement : oilist->getParsingElements()) {
- if (!oilist->getUnitAttrParsingElement(pelement))
+ if (!oilist->getUnitVariableParsingElement(pelement))
for (FormatElement *element : pelement)
genElementParserStorage(element, op, body);
}
@@ -1049,7 +1110,6 @@ static void genCustomDirectiveParser(CustomDirective *dir, MethodBody &body,
body << llvm::formatv(" result.addAttribute(\"{0}\", {0}Attr);\n",
var->name);
}
-
} else if (auto *operand = dyn_cast<OperandVariable>(param)) {
const NamedTypeConstraint *var = operand->getVar();
if (var->isOptional()) {
@@ -1137,6 +1197,29 @@ static void genEnumAttrParser(const NamedAttribute *var, MethodBody &body,
validCaseKeywordsStr, errorMessage, attrAssignment);
}
+// Generate the parser for a property.
+static void genPropertyParser(PropertyVariable *propVar, MethodBody &body,
+ StringRef opCppClassName,
+ bool requireParse = true) {
+ StringRef name = propVar->getVar()->name;
+ const Property &prop = propVar->getVar()->prop;
+ bool parseOptionally =
+ prop.hasDefaultValue() && !requireParse && prop.hasOptionalParser();
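+  // Bind the $_parser/$_ctxt/$_storage placeholders from the TableGen property
+  // definition to the names available inside the generated parse method.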
+ FmtContext fmtContext;
+ fmtContext.addSubst("_parser", "parser");
+ fmtContext.addSubst("_ctxt", "parser.getContext()");
+ fmtContext.addSubst("_storage", "propStorage");
+
+ if (parseOptionally) {
+ body << formatv(optionalPropertyParserCode, name, opCppClassName,
+ tgfmt(prop.getOptionalParserCall(), &fmtContext));
+ } else {
+ body << formatv(propertyParserCode, name, opCppClassName,
+ tgfmt(prop.getParserCall(), &fmtContext),
+ prop.getSummary());
+ }
+}
+
// Generate the parser for an attribute.
static void genAttrParser(AttributeVariable *attr, MethodBody &body,
FmtContext &attrTypeCtx, bool parseAsOptional,
@@ -1213,14 +1296,16 @@ if (!dict) {
}
)decl";
- // TODO: properties might be optional as well.
+ // {0}: fromAttribute call
+ // {1}: property name
+ // {2}: isRequired
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0};
};
auto attr = dict.get("{1}");
-if (!attr) {{
+if (!attr && {2}) {{
emitError() << "expected key entry for {1} in DictionaryAttr to set "
"Properties.";
return ::mlir::failure();
@@ -1238,13 +1323,14 @@ if (::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
StringRef name = namedProperty.name;
const Property &prop = namedProperty.prop;
+ bool isRequired = !prop.hasDefaultValue();
FmtContext fctx;
body << formatv(propFromAttrFmt,
tgfmt(prop.getConvertFromAttributeCall(),
&fctx.addSubst("_attr", "propAttr")
.addSubst("_storage", "propStorage")
.addSubst("_diag", "emitError")),
- name);
+ name, isRequired);
}
// Generate the setter for any attribute not parsed elsewhere.
@@ -1331,20 +1417,24 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
// If the anchor is a unit attribute, we don't need to print it. When
// parsing, we will add this attribute if this group is present.
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(optional->getAnchor());
- if (anchorAttr && anchorAttr != firstElement &&
- anchorAttr->isUnitAttr()) {
- elidedAnchorElement = anchorAttr;
+ auto *anchorVar = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
+ if (anchorVar && anchorVar != firstElement && anchorVar->isUnit()) {
+ elidedAnchorElement = anchorVar;
if (!thenGroup == optional->isInverted()) {
- // Add the anchor unit attribute to the operation state.
- if (useProperties) {
+          // Add the anchor unit attribute to the operation state, or set the
+          // anchor unit property to true.
+ if (isa<PropertyVariable>(anchorVar)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ anchorVar->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- anchorAttr->getVar()->name, opCppClassName);
+ anchorVar->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << anchorAttr->getVar()->name
+ body << " result.addAttribute(\"" << anchorVar->getName()
<< "\", parser.getBuilder().getUnitAttr());\n";
}
}
@@ -1368,6 +1458,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
genAttrParser(attrVar, body, attrTypeCtx, /*parseAsOptional=*/true,
useProperties, opCppClassName);
body << " if (" << attrVar->getVar()->name << "Attr) {\n";
+ } else if (auto *propVar = dyn_cast<PropertyVariable>(firstElement)) {
+ genPropertyParser(propVar, body, opCppClassName, /*requireParse=*/false);
+ body << llvm::formatv("if ({0}PropParseResult.has_value() && "
+ "succeeded(*{0}PropParseResult)) ",
+ propVar->getVar()->name)
+ << " {\n";
} else if (auto *literal = dyn_cast<LiteralElement>(firstElement)) {
body << " if (::mlir::succeeded(parser.parseOptional";
genLiteralParser(literal->getSpelling(), body);
@@ -1430,15 +1526,19 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
body << ")) {\n";
StringRef lelementName = lelement->getSpelling();
body << formatv(oilistParserCode, lelementName);
- if (AttributeVariable *unitAttrElem =
- oilist->getUnitAttrParsingElement(pelement)) {
- if (useProperties) {
+ if (AttributeLikeVariable *unitVarElem =
+ oilist->getUnitVariableParsingElement(pelement)) {
+ if (isa<PropertyVariable>(unitVarElem)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ unitVarElem->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- unitAttrElem->getVar()->name, opCppClassName);
+ unitVarElem->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << unitAttrElem->getVar()->name
+ body << " result.addAttribute(\"" << unitVarElem->getName()
<< "\", UnitAttr::get(parser.getContext()));\n";
}
} else {
@@ -1468,6 +1568,8 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
(genCtx == GenContext::Normal && attr->getVar()->attr.isOptional());
genAttrParser(attr, body, attrTypeCtx, parseAsOptional, useProperties,
opCppClassName);
+ } else if (auto *prop = dyn_cast<PropertyVariable>(element)) {
+ genPropertyParser(prop, body, opCppClassName);
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
ArgumentLengthKind lengthKind = getArgumentLengthKind(operand->getVar());
@@ -1876,6 +1978,38 @@ const char *enumAttrBeginPrinterCode = R"(
auto caseValueStr = {1}(caseValue);
)";
+/// Generate a check that an optional or default-valued attribute or property
+/// has a non-default value. For these purposes, the default state of an
+/// optional attribute is its absence, so presence is what is checked, even if
+/// the attribute itself also declares a default value.
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ AttributeVariable &attrElement) {
+ Attribute attr = attrElement.getVar()->attr;
+ std::string getter = op.getGetterName(attrElement.getVar()->name);
+ bool optionalAndDefault = attr.isOptional() && attr.hasDefaultValue();
+ if (optionalAndDefault)
+ body << "(";
+ if (attr.isOptional())
+ body << getter << "Attr()";
+ if (optionalAndDefault)
+ body << " && ";
+ if (attr.hasDefaultValue()) {
+ FmtContext fctx;
+ fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
+ body << getter << "Attr() != "
+ << tgfmt(attr.getConstBuilderTemplate(), &fctx,
+ attr.getDefaultValue());
+ }
+ if (optionalAndDefault)
+ body << ")";
+}
+
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ PropertyVariable &propElement) {
+ body << op.getGetterName(propElement.getVar()->name)
+ << "() != " << propElement.getVar()->prop.getDefaultValue();
+}
+
/// Generate the printer for the 'prop-dict' directive.
static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
MethodBody &body) {
@@ -1904,6 +2038,15 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
body << " }\n";
}
}
+ // Similarly, elide default-valued properties.
+ for (const NamedProperty &prop : op.getProperties()) {
+ if (prop.prop.hasDefaultValue()) {
+ body << " if (" << op.getGetterName(prop.name)
+ << "() == " << prop.prop.getDefaultValue() << ") {";
+ body << " elidedProps.push_back(\"" << prop.name << "\");\n";
+ body << " }\n";
+ }
+ }
body << " _odsPrinter << \" \";\n"
<< " printProperties(this->getContext(), _odsPrinter, "
@@ -2031,7 +2174,6 @@ static void genCustomDirectiveParameterPrinter(FormatElement *element,
} else if (auto *property = dyn_cast<PropertyVariable>(element)) {
FmtContext ctx;
- ctx.addSubst("_ctxt", "getContext()");
const NamedProperty *namedProperty = property->getVar();
ctx.addSubst("_storage", "getProperties()." + namedProperty->name);
body << tgfmt(namedProperty->prop.getConvertFromStorageCall(), &ctx);
@@ -2154,16 +2296,6 @@ static void genEnumAttrPrinter(const NamedAttribute *var, const Operator &op,
" }\n";
}
-/// Generate a check that a DefaultValuedAttr has a value that is non-default.
-static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
- AttributeVariable &attrElement) {
- FmtContext fctx;
- Attribute attr = attrElement.getVar()->attr;
- fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
- body << " && " << op.getGetterName(attrElement.getVar()->name) << "Attr() != "
- << tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue());
-}
-
/// Generate the check for the anchor of an optional group.
static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
const Operator &op,
@@ -2190,17 +2322,12 @@ static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
genOptionalGroupPrinterAnchor(element->getInputs(), op, body);
})
.Case([&](AttributeVariable *element) {
- Attribute attr = element->getVar()->attr;
- body << op.getGetterName(element->getVar()->name) << "Attr()";
- if (attr.isOptional())
- return; // done
- if (attr.hasDefaultValue()) {
- // Consider a default-valued attribute as present if it's not the
- // default value.
- genNonDefaultValueCheck(body, op, *element);
- return;
- }
- llvm_unreachable("attribute must be optional or default-valued");
+        // Consider a default-valued attribute present if it is not the
+        // default value, and an optional one present if it is set.
+ genNonDefaultValueCheck(body, op, *element);
+ })
+ .Case([&](PropertyVariable *element) {
+ genNonDefaultValueCheck(body, op, *element);
})
.Case([&](CustomDirective *ele) {
body << '(';
@@ -2276,10 +2403,10 @@ void OperationFormat::genElementPrinter(FormatElement *element,
ArrayRef<FormatElement *> thenElements = optional->getThenElements();
ArrayRef<FormatElement *> elseElements = optional->getElseElements();
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(anchor);
+ auto *anchorAttr = dyn_cast<AttributeLikeVariable>(anchor);
if (anchorAttr && anchorAttr != thenElements.front() &&
(elseElements.empty() || anchorAttr != elseElements.front()) &&
- anchorAttr->isUnitAttr()) {
+ anchorAttr->isUnit()) {
elidedAnchorElement = anchorAttr;
}
auto genElementPrinters = [&](ArrayRef<FormatElement *> elements) {
@@ -2319,13 +2446,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
for (VariableElement *var : vars) {
TypeSwitch<FormatElement *>(var)
.Case([&](AttributeVariable *attrEle) {
- body << " || (" << op.getGetterName(attrEle->getVar()->name)
- << "Attr()";
- Attribute attr = attrEle->getVar()->attr;
- if (attr.hasDefaultValue()) {
- // Don't print default-valued attributes.
- genNonDefaultValueCheck(body, op, *attrEle);
- }
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *attrEle);
+ body << ")";
+ })
+ .Case([&](PropertyVariable *propEle) {
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *propEle);
body << ")";
})
.Case([&](OperandVariable *ele) {
@@ -2352,7 +2479,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
body << ") {\n";
genLiteralPrinter(lelement->getSpelling(), body, shouldEmitSpace,
lastWasPunctuation);
- if (oilist->getUnitAttrParsingElement(pelement) == nullptr) {
+ if (oilist->getUnitVariableParsingElement(pelement) == nullptr) {
for (FormatElement *element : pelement)
genElementPrinter(element, body, op, shouldEmitSpace,
lastWasPunctuation);
@@ -2369,7 +2496,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
return;
}
- // Emit the attribute dictionary.
+ // Emit the property dictionary.
if (isa<PropDictDirective>(element)) {
genPropDictPrinter(*this, op, body);
lastWasPunctuation = false;
@@ -2408,6 +2535,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
else
body << "_odsPrinter.printStrippedAttrOrType("
<< op.getGetterName(var->name) << "Attr());\n";
+ } else if (auto *property = dyn_cast<PropertyVariable>(element)) {
+ const NamedProperty *var = property->getVar();
+ FmtContext fmtContext;
+ fmtContext.addSubst("_printer", "_odsPrinter");
+ fmtContext.addSubst("_ctxt", "getContext()");
+ fmtContext.addSubst("_storage", "getProperties()." + var->name);
+ body << tgfmt(var->prop.getPrinterCall(), &fmtContext) << ";\n";
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
if (operand->getVar()->isVariadicOfVariadic()) {
body << " ::llvm::interleaveComma("
@@ -2737,6 +2871,10 @@ static bool isOptionallyParsed(FormatElement *el) {
Attribute attr = attrVar->getVar()->attr;
return attr.isOptional() || attr.hasDefaultValue();
}
+ if (auto *propVar = dyn_cast<PropertyVariable>(el)) {
+ const Property &prop = propVar->getVar()->prop;
+ return prop.hasDefaultValue() && prop.hasOptionalParser();
+ }
if (auto *operandVar = dyn_cast<OperandVariable>(el)) {
const NamedTypeConstraint *operand = operandVar->getVar();
return operand->isOptional() || operand->isVariadic() ||
@@ -3141,10 +3279,9 @@ OpFormatParser::parseVariableImpl(SMLoc loc, StringRef name, Context ctx) {
}
if (const NamedProperty *property = findArg(op.getProperties(), name)) {
- if (ctx != CustomDirectiveContext && ctx != RefDirectiveContext)
+ if (ctx == TypeDirectiveContext)
return emitError(
- loc, "properties currently only supported in `custom` directive");
-
+ loc, "properties cannot be used as children to a `type` directive");
if (ctx == RefDirectiveContext) {
if (!seenProperties.count(property))
return emitError(loc, "property '" + name +
@@ -3428,6 +3565,15 @@ LogicalResult OpFormatParser::verifyOIListParsingElement(FormatElement *element,
"an oilist parsing group");
return success();
})
+ // Only default-valued properties can be within an oilist parsing group.
+ .Case([&](PropertyVariable *propEle) {
+ if (!propEle->getVar()->prop.hasDefaultValue())
+ return emitError(
+ loc,
+ "only default-valued or optional properties can be used in "
+ "an olist parsing group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an
// oilist parsing group.
.Case([&](OperandVariable *ele) {
@@ -3557,6 +3703,16 @@ LogicalResult OpFormatParser::verifyOptionalGroupElement(SMLoc loc,
"can be used to anchor an optional group");
return success();
})
+ // All properties can be within the optional group, but only
+ // default-valued properties with an optional parser can be the anchor.
+ .Case([&](PropertyVariable *propEle) {
+ Property prop = propEle->getVar()->prop;
+ if (isAnchor && !(prop.hasDefaultValue() && prop.hasOptionalParser()))
+ return emitError(loc, "only properties with default values "
+ "that can be optionally parsed "
+ "can be used to anchor an optional group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an optional
// group.
.Case([&](OperandVariable *ele) {
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index a1c089a..907fe10 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -10,6 +10,12 @@ package(
licenses(["notice"])
+genrule(
+ name = "generate_vcs_revision",
+ outs = ["include/VCSVersion.inc"],
+ cmd = "echo '#undef BOLT_REVISION' >> $@\n",
+)
+
cc_binary(
name = "llvm-bolt-heatmap",
srcs = glob([
@@ -114,6 +120,7 @@ cc_library(
textual_hdrs = glob([
"include/bolt/RuntimeLibs/*.h",
]) + ["include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc"],
+ defines=["CMAKE_INSTALL_FULL_LIBDIR=\\\"\\\""],
deps = [
":Core",
":Passes",
@@ -289,7 +296,9 @@ cc_library(
srcs = glob([
"lib/Utils/*.cpp",
]),
- hdrs = glob([
+ hdrs = [
+ "include/VCSVersion.inc",
+ ] + glob([
"include/bolt/Utils/*.h",
]),
includes = ["include"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 64d36c7..4d443e8 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -944,10 +944,7 @@ cc_library(
srcs = glob([
"lib/IR/*.cpp",
"lib/IR/*.h",
- ]) + [
- # To avoid a dependency cycle.
- "include/llvm/Analysis/IVDescriptors.h",
- ],
+ ]),
hdrs = glob(
[
"include/llvm/*.h",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 536c825..f83c471 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2142,6 +2142,7 @@ cc_library(
":FuncTransforms",
":FunctionInterfaces",
":IR",
+ ":IndexDialect",
":LLVMCommonConversion",
":LLVMDialect",
":MemRefDialect",
@@ -2149,6 +2150,7 @@ cc_library(
":SCFDialect",
":SCFTransforms",
":TransformUtils",
+ ":VectorUtils",
"//llvm:Support",
],
)
@@ -5744,6 +5746,7 @@ cc_library(
"lib/Conversion/GPUCommon/OpToFuncCallLowering.h",
],
deps = [
+ ":ArithDialect",
":GPUDialect",
":IR",
":LLVMCommonConversion",