67 files changed, 657 insertions, 524 deletions
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 06e840e..e1a1856 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -346,13 +346,16 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { uint64_t Bytes; Function.markUnreachableBlocks(); LLVM_DEBUG({ + bool HasInvalidBB = false; for (BinaryBasicBlock &BB : Function) { if (!BB.isValid()) { + HasInvalidBB = true; dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() << " in function " << Function << "\n"; - Function.dump(); } } + if (HasInvalidBB) + Function.dump(); }); BinaryContext::IndependentCodeEmitter Emitter = BC.createIndependentMCCodeEmitter(); diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index bef6e9c1..495f2ab 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -32,7 +32,7 @@ Feature Checking Macros ======================= Language extensions can be very useful, but only if you know you can depend on -them. In order to allow fine-grain features checks, we support three builtin +them. In order to allow fine-grained feature checks, we support three builtin function-like macros. This allows you to directly test for a feature in your code without having to resort to something like autoconf or fragile "compiler version checks". @@ -137,7 +137,7 @@ feature) or 0 if not. They can be used like this: .. _langext-has-feature-back-compat: For backward compatibility, ``__has_feature`` can also be used to test -for support for non-standardized features, i.e. features not prefixed ``c_``, +for support for non-standardized features, i.e., features not prefixed ``c_``, ``cxx_`` or ``objc_``. Another use of ``__has_feature`` is to check for compiler features not related @@ -159,7 +159,7 @@ of ``cxx_rvalue_references``. This function-like macro is available in C++20 by default, and is provided as an extension in earlier language standards. It takes a single argument that is the -name of a double-square-bracket-style attribute. The argument can either be a +name of a double-square-bracket-style attribute. The argument can be either a single identifier or a scoped identifier. If the attribute is supported, a nonzero value is returned. If the attribute is a standards-based attribute, this macro returns a nonzero value based on the year and month in which the attribute @@ -320,7 +320,7 @@ To test for this feature, use ``#if defined(__has_include)``: .. code-block:: c++ - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include) #if __has_include("myinclude.h") # include "myinclude.h" @@ -345,7 +345,7 @@ or 0 otherwise: # include_next "myinclude.h" #endif - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include_next) #if __has_include_next("myinclude.h") # include_next "myinclude.h" @@ -361,7 +361,7 @@ absolute path is used in the file argument. ----------------- This function-like macro takes a string literal that represents a command line -option for a warning and returns true if that is a valid warning option. +option for a warning and returns ``true`` if that is a valid warning option. .. code-block:: c++ @@ -396,7 +396,7 @@ Builtin Macros file. ``__clang__`` - Defined when compiling with Clang + Defined when compiling with Clang. 
``__clang_major__`` Defined to the major marketing version number of Clang (e.g., the 2 in @@ -599,7 +599,7 @@ NEON vector types are created using ``neon_vector_type`` and GCC vector types are created using the ``vector_size(N)`` attribute. The argument ``N`` specifies the number of bytes that will be allocated for an -object of this type. The size has to be multiple of the size of the vector +object of this type. The size has to be a multiple of the size of the vector element type. For example: .. code-block:: c++ @@ -620,7 +620,7 @@ element type. For example: Boolean Vectors --------------- -Clang also supports the ext_vector_type attribute with boolean element types in +Clang also supports the ``ext_vector_type`` attribute with boolean element types in C and C++. For example: .. code-block:: c++ @@ -674,7 +674,7 @@ Vector Literals Vector literals can be used to create vectors from a set of scalars, or vectors. Either parentheses or braces form can be used. In the parentheses -form the number of literal values specified must be one, i.e. referring to a +form the number of literal values specified must be one, i.e., referring to a scalar value, or must match the size of the vector type being created. If a single scalar literal value is specified, the scalar literal value will be replicated to all the components of the vector type. In the brackets form any @@ -725,12 +725,12 @@ address &v[i] no no no [#]_ no no See also :ref:`langext-__builtin_shufflevector`, :ref:`langext-__builtin_convertvector`. -.. [#] ternary operator(?:) has different behaviors depending on condition - operand's vector type. If the condition is a GNU vector (i.e. __vector_size__), +.. [#] ternary operator(?:) has different behaviors depending on the condition + operand's vector type. If the condition is a GNU vector (i.e., ``__vector_size__``), a NEON vector or an SVE vector, it's only available in C++ and uses normal bool conversions (that is, != 0). If it's an extension (OpenCL) vector, it's only available in C and OpenCL C. - And it selects based on signedness of the condition operands (OpenCL v1.1 s6.3.9). + And it selects based on the signedness of the condition operands (OpenCL v1.1 s6.3.9). .. [#] sizeof can only be used on vector length specific SVE types. .. [#] Clang does not allow the address of an element to be taken while GCC allows this. This is intentional for vectors with a boolean element type and @@ -747,7 +747,7 @@ to perform additional operations on certain scalar and vector types. Let ``T`` be one of the following types: * an integer type (as in C23 6.2.5p22), but excluding enumerated types and ``bool`` -* the standard floating types float or double +* the standard floating types ``float`` or ``double`` * a half-precision floating point type, if one is supported on the target * a vector type. @@ -833,7 +833,7 @@ of different sizes and signs is forbidden in binary and ternary builtins. T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types of a floating-point number T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types - T __builtin_elementwise_fmod(T x, T y) return The floating-point remainder of (x/y) whose sign floating point types + T __builtin_elementwise_fmod(T x, T y) return the floating-point remainder of (x/y) whose sign floating point types matches the sign of x. 
T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types For floating point types, follows semantics of maxNum @@ -948,7 +948,7 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``. Each builtin accesses memory according to a provided boolean mask. These are provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first -argument is always boolean mask vector. The ``__builtin_masked_load`` builtin +argument is always a boolean mask vector. The ``__builtin_masked_load`` builtin takes an optional third vector argument that will be used for the result of the masked-off lanes. These builtins assume the memory is unaligned, use ``__builtin_assume_aligned`` if alignment is desired. @@ -1109,7 +1109,7 @@ to ``float``; see below for more information on this emulation. ``__fp16`` and ``_Float16`` both use the binary16 format from IEEE 754-2008, which provides a 5-bit exponent and an 11-bit significand -(counting the implicit leading 1). ``__bf16`` uses the `bfloat16 +(including the implicit leading 1). ``__bf16`` uses the `bfloat16 <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ format, which provides an 8-bit exponent and an 8-bit significand; this is the same exponent range as `float`, just with greatly reduced precision. @@ -1143,7 +1143,7 @@ types with the ``-ffloat16-excess-precision=`` and * ``none``: meaning to perform strict operation-by-operation emulation * ``standard``: meaning that excess precision is permitted under the rules - described in the standard, i.e. never across explicit casts or statements + described in the standard, i.e., never across explicit casts or statements * ``fast``: meaning that excess precision is permitted whenever the optimizer sees an opportunity to avoid truncations; currently this has no effect beyond ``standard`` @@ -1899,7 +1899,7 @@ Type Trait Primitives Type trait primitives are special builtin constant expressions that can be used by the standard C++ library to facilitate or simplify the implementation of -user-facing type traits in the <type_traits> header. +user-facing type traits in the ``<type_traits>`` header. They are not intended to be used directly by user code because they are implementation-defined and subject to change -- as such they're tied closely to @@ -2054,7 +2054,7 @@ The following type trait primitives are supported by Clang. Those traits marked * ``__is_volatile`` (C++, Embarcadero) * ``__reference_binds_to_temporary(T, U)`` (Clang): Determines whether a reference of type ``T`` bound to an expression of type ``U`` would bind to a - materialized temporary object. If ``T`` is not a reference type the result + materialized temporary object. If ``T`` is not a reference type, the result is false. Note this trait will also return false when the initialization of ``T`` from ``U`` is ill-formed. Deprecated, use ``__reference_constructs_from_temporary``. @@ -2182,7 +2182,7 @@ Prior to clang-16, the output may only be used safely when the indirect branches are not taken. Query for this difference with ``__has_extension(gnu_asm_goto_with_outputs_full)``. -When using tied-outputs (i.e. outputs that are inputs and outputs, not just +When using tied-outputs (i.e., outputs that are inputs and outputs, not just outputs) with the `+r` constraint, there is a hidden input that's created before the label, so numeric references to operands must account for that. 
@@ -2361,7 +2361,7 @@ Loading from a ``__weak`` variable always implicitly retains the loaded value. In non-ARC modes, this retain is normally balanced by an implicit autorelease. This autorelease can be suppressed by performing the load in the receiver position of a ``-retain`` -message send (e.g. ``[weakReference retain]``); note that this performs +message send (e.g., ``[weakReference retain]``); note that this performs only a single retain (the retain done when primitively loading from the weak reference). @@ -2562,7 +2562,7 @@ When a method that's introduced in the OS newer than the target OS is called, a .. code-block:: objc void my_fun(NSSomeClass* var) { - // If fancyNewMethod was added in e.g. macOS 10.12, but the code is + // If fancyNewMethod was added in e.g., macOS 10.12, but the code is // built with -mmacos-version-min=10.11, then this unconditional call // will emit a -Wunguarded-availability warning: [var fancyNewMethod]; @@ -2725,7 +2725,7 @@ correctly in any circumstances. It can be used if: metaprogramming algorithms to be able to specify/detect types generically. - the generated kernel binary does not contain indirect calls because they - are eliminated using compiler optimizations e.g. devirtualization. + are eliminated using compiler optimizations e.g., devirtualization. - the selected target supports the function pointer like functionality e.g. most CPU targets. @@ -2755,12 +2755,12 @@ Extension Specification, section 1.2 <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#extensions-overview>`_. This is not conformant behavior and it can only be used portably when the -functions with variadic prototypes do not get generated in binary e.g. the +functions with variadic prototypes do not get generated in binary e.g., the variadic prototype is used to specify a function type with any number of arguments in metaprogramming algorithms in C++ for OpenCL. This extension can also be used when the kernel code is intended for targets -supporting the variadic arguments e.g. majority of CPU targets. +supporting the variadic arguments e.g., majority of CPU targets. **Example of Use**: @@ -5170,8 +5170,8 @@ __builtin_amdgcn_fence ``__builtin_amdgcn_fence`` emits a fence. -* ``unsigned`` atomic ordering, e.g. ``__ATOMIC_ACQUIRE`` -* ``const char *`` synchronization scope, e.g. ``workgroup`` +* ``unsigned`` atomic ordering, e.g., ``__ATOMIC_ACQUIRE`` +* ``const char *`` synchronization scope, e.g., ``workgroup`` * Zero or more ``const char *`` address spaces names. The address spaces arguments must be one of the following string literals: @@ -5205,7 +5205,7 @@ boolean argument as a bit for every lane of the current wave that is currently active (i.e., that is converged with the executing thread), and a 0 bit for every lane that is not active. -The result is uniform, i.e. it is the same in every active thread of the wave. +The result is uniform, i.e., it is the same in every active thread of the wave. __builtin_amdgcn_inverse_ballot_w{32,64} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5374,7 +5374,7 @@ followed by ``off`` or ``on``. All function definitions in the region between an ``off`` and the following ``on`` will be decorated with the ``optnone`` attribute unless doing so would -conflict with explicit attributes already present on the function (e.g. the +conflict with explicit attributes already present on the function (e.g., the ones that control inlining). .. 
code-block:: c++ @@ -5472,12 +5472,12 @@ provides options for vectorization, interleaving, predication, unrolling and distribution. Loop hints can be specified before any loop and will be ignored if the optimization is not safe to apply. -There are loop hints that control transformations (e.g. vectorization, loop +There are loop hints that control transformations (e.g., vectorization, loop unrolling) and there are loop hints that set transformation options (e.g. ``vectorize_width``, ``unroll_count``). Pragmas setting transformation options -imply the transformation is enabled, as if it was enabled via the corresponding -transformation pragma (e.g. ``vectorize(enable)``). If the transformation is -disabled (e.g. ``vectorize(disable)``), that takes precedence over +imply the transformation is enabled, as if it were enabled via the corresponding +transformation pragma (e.g., ``vectorize(enable)``). If the transformation is +disabled (e.g., ``vectorize(disable)``), that takes precedence over transformations option pragmas implying that transformation. Vectorization, Interleaving, and Predication @@ -5704,7 +5704,7 @@ operation when supported by the target. The pragma can take three values: ``on``, ``fast`` and ``off``. The ``on`` option is identical to using ``#pragma STDC FP_CONTRACT(ON)`` and it allows fusion as specified the language standard. The ``fast`` option allows fusion -in cases when the language standard does not make this possible (e.g. across +in cases when the language standard does not make this possible (e.g., across statements in C). .. code-block:: c++ @@ -5859,14 +5859,14 @@ and ``ignore_denormal_mode``, each optionally prefixed with ``no_``. The meaning are as follows: - ``remote_memory`` means atomic operations may be performed on remote - memory, i.e. memory accessed through off-chip interconnects (e.g., PCIe). + memory, i.e., memory accessed through off-chip interconnects (e.g., PCIe). On ROCm platforms using HIP, remote memory refers to memory accessed via PCIe and is subject to specific atomic operation support. See `ROCm PCIe Atomics <https://rocm.docs.amd.com/en/latest/conceptual/ pcie-atomics.html>`_ for further details. Prefixing with ``no_remote_memory`` indicates that atomic operations should not be performed on remote memory. - ``fine_grained_memory`` means atomic operations may be performed on fine-grained - memory, i.e. memory regions that support fine-grained coherence, where updates to + memory, i.e., memory regions that support fine-grained coherence, where updates to memory are visible to other parts of the system even while modifications are ongoing. For example, in HIP, fine-grained coherence ensures that host and device share up-to-date data without explicit synchronization (see @@ -6088,7 +6088,7 @@ match rules are specified after the attribute. The compiler expects an identifier that corresponds to the subject set specifier. The ``apply_to`` specifier is currently the only supported subject set specifier. It allows you to specify match rules that form a subset of the attribute's allowed subject -set, i.e. the compiler doesn't require all of the attribute's subjects. For +set, i.e., the compiler doesn't require all of the attribute's subjects. 
For example, an attribute like ``[[nodiscard]]`` whose subject set includes ``enum``, ``record`` and ``hasType(functionType)``, requires the presence of at least one of these rules after ``apply_to``: diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index c2d1bc1..a97e934 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -811,10 +811,10 @@ bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const { bool AArch64TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) - .Cases("aarch64", "arm64", "arm", true) + .Cases({"aarch64", "arm64", "arm"}, true) .Case("fmv", HasFMV) .Case("fp", FPU & FPUMode) - .Cases("neon", "simd", FPU & NeonMode) + .Cases({"neon", "simd"}, FPU & NeonMode) .Case("jscvt", HasJSCVT) .Case("fcma", HasFCMA) .Case("rng", HasRandGen) @@ -829,8 +829,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("cssc", HasCSSC) .Case("sha2", HasSHA2) .Case("sha3", HasSHA3) - .Cases("aes", "pmull", HasAES) - .Cases("fp16", "fullfp16", HasFullFP16) + .Cases({"aes", "pmull"}, HasAES) + .Cases({"fp16", "fullfp16"}, HasFullFP16) .Case("dit", HasDIT) .Case("dpb", HasCCPP) .Case("dpb2", HasCCDP) @@ -859,9 +859,9 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("memtag", HasMTE) .Case("sb", HasSB) .Case("predres", HasPredRes) - .Cases("ssbs", "ssbs2", HasSSBS) + .Cases({"ssbs", "ssbs2"}, HasSSBS) .Case("bti", HasBTI) - .Cases("ls64", "ls64_v", "ls64_accdata", HasLS64) + .Cases({"ls64", "ls64_v", "ls64_accdata"}, HasLS64) .Case("wfxt", HasWFxT) .Case("rcpc3", HasRCPC3) .Case("fp8", HasFP8) diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 9f3a4cd..846b240 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -122,41 +122,41 @@ public: .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Cases("power3", "pwr3", ArchDefinePpcgr) - .Cases("power4", "pwr4", + .Cases({"power3", "pwr3"}, ArchDefinePpcgr) + .Cases({"power4", "pwr4"}, ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Cases("power5", "pwr5", ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power5x", "pwr5x", + .Cases({"power5x", "pwr5x"}, ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6", "pwr6", - ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | - ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6x", "pwr6x", + .Cases({"power6", "pwr6"}, ArchDefinePwr6 | ArchDefinePwr5x | + ArchDefinePwr5 | ArchDefinePwr4 | + ArchDefinePpcgr | ArchDefinePpcsq) + .Cases({"power6x", "pwr6x"}, ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power7", "pwr7", - ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | - ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | - ArchDefinePpcsq) + .Cases({"power7", "pwr7"}, ArchDefinePwr7 | ArchDefinePwr6 | + ArchDefinePwr5x | ArchDefinePwr5 | + ArchDefinePwr4 | ArchDefinePpcgr | + ArchDefinePpcsq) // powerpc64le automatically defaults to at least power8. 
- .Cases("power8", "pwr8", "ppc64le", + .Cases({"power8", "pwr8", "ppc64le"}, ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power9", "pwr9", + .Cases({"power9", "pwr9"}, ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power10", "pwr10", + .Cases({"power10", "pwr10"}, ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power11", "pwr11", + .Cases({"power11", "pwr11"}, ArchDefinePwr11 | ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d2cb751..bd36eb4 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3214,7 +3214,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, DashX = llvm::StringSwitch<InputKind>(XValue) .Case("cpp-output", InputKind(Language::C).getPreprocessed()) .Case("assembler-with-cpp", Language::Asm) - .Cases("ast", "pcm", "precompiled-header", + .Cases({"ast", "pcm", "precompiled-header"}, InputKind(Language::Unknown, InputKind::Precompiled)) .Case("ir", Language::LLVM_IR) .Case("cir", Language::CIR) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 5b08d7f..1c06152 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1283,10 +1283,10 @@ bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, // Per tweaked N3660, "il", "i", and "if" are also used in the library. // In C++2a "d" and "y" are used in the library. return llvm::StringSwitch<bool>(Suffix) - .Cases("h", "min", "s", true) - .Cases("ms", "us", "ns", true) - .Cases("il", "i", "if", true) - .Cases("d", "y", LangOpts.CPlusPlus20) + .Cases({"h", "min", "s"}, true) + .Cases({"ms", "us", "ns"}, true) + .Cases({"il", "i", "if"}, true) + .Cases({"d", "y"}, LangOpts.CPlusPlus20) .Default(false); } diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst index da92ed3..855e2cc 100644 --- a/llvm/docs/CIBestPractices.rst +++ b/llvm/docs/CIBestPractices.rst @@ -136,3 +136,20 @@ branches as follows: branches: - main - releases/* + +Container Best Practices +======================== + +This section contains best practices/guidelines when working with containers +for LLVM infrastructure. + +Using Fully Qualified Container Names +------------------------------------- + +When referencing container images from a registry, such as in Github Actions +workflows, or in ``Dockerfile`` files used for building images, prefer fully +qualified names (i.e., including the registry domain) over just the image. +For example, prefer ``docker.io/ubuntu:24.04`` over ``ubuntu:24.04``. This +ensures portability across systems where a different default registry might +be specified and also prevents attackers from changing the default registry +to pull in a malicious image instead of the intended one. 
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h index 1fbc41c..be9546e 100644 --- a/llvm/include/llvm/ADT/Bitfields.h +++ b/llvm/include/llvm/ADT/Bitfields.h @@ -100,8 +100,8 @@ template <typename Bitfield, typename StorageType> struct Impl { using IntegerType = typename Bitfield::IntegerType; static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT; - static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask"); - static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask"); + static_assert(Bitfield::FirstBit < StorageBits, "Data must fit in mask"); + static_assert(Bitfield::LastBit < StorageBits, "Data must fit in mask"); static constexpr StorageType LowMask = maskTrailingOnes<StorageType>(Bitfield::Bits); static constexpr StorageType Mask = LowMask << Bitfield::Shift; diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 6a943c5..0cc03cf 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -177,7 +177,7 @@ public: #if LLVM_ENABLE_THREADS // Lock bucket. - CurBucket.Guard.lock(); + std::scoped_lock<std::mutex> Lock(CurBucket.Guard); #endif HashesPtr BucketHashes = CurBucket.Hashes; @@ -195,11 +195,6 @@ public: CurBucket.NumberOfEntries++; RehashBucket(CurBucket); - -#if LLVM_ENABLE_THREADS - CurBucket.Guard.unlock(); -#endif - return {NewData, true}; } @@ -208,10 +203,6 @@ public: KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { // Already existed entry matched with inserted data is found. -#if LLVM_ENABLE_THREADS - CurBucket.Guard.unlock(); -#endif - return {EntryData, false}; } } diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 25b5262..22ef7ed 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -130,13 +130,13 @@ public: return; } - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); if constexpr (std::is_trivially_destructible_v<ValueT>) { // Use a simpler loop when values don't need destruction. for (BucketT &B : buckets()) B.getFirst() = EmptyKey; } else { - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); unsigned NumEntries = getNumEntries(); for (BucketT &B : buckets()) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey)) { @@ -154,6 +154,17 @@ public: setNumTombstones(0); } + void shrink_and_clear() { + auto [Reallocate, NewNumBuckets] = derived().planShrinkAndClear(); + destroyAll(); + if (!Reallocate) { + initEmpty(); + return; + } + derived().deallocateBuckets(); + derived().init(NewNumBuckets); + } + /// Return true if the specified key is in the map, false otherwise. [[nodiscard]] bool contains(const_arg_type_t<KeyT> Val) const { return doFind(Val) != nullptr; @@ -314,7 +325,7 @@ public: return false; // not in map. 
TheBucket->getSecond().~ValueT(); - TheBucket->getFirst() = getTombstoneKey(); + TheBucket->getFirst() = KeyInfoT::getTombstoneKey(); decrementNumEntries(); incrementNumTombstones(); return true; @@ -322,7 +333,7 @@ public: void erase(iterator I) { BucketT *TheBucket = &*I; TheBucket->getSecond().~ValueT(); - TheBucket->getFirst() = getTombstoneKey(); + TheBucket->getFirst() = KeyInfoT::getTombstoneKey(); decrementNumEntries(); incrementNumTombstones(); } @@ -353,10 +364,17 @@ protected: DenseMapBase() = default; void destroyAll() { + // No need to iterate through the buckets if both KeyT and ValueT are + // trivially destructible. + if constexpr (std::is_trivially_destructible_v<KeyT> && + std::is_trivially_destructible_v<ValueT>) + return; + if (getNumBuckets() == 0) // Nothing to do. return; - const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (BucketT &B : buckets()) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) @@ -366,12 +384,14 @@ protected: } void initEmpty() { + static_assert(std::is_base_of_v<DenseMapBase, DerivedT>, + "Must pass the derived type to this template!"); setNumEntries(0); setNumTombstones(0); assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 && "# initial buckets must be a power of two!"); - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); for (BucketT &B : buckets()) ::new (&B.getFirst()) KeyT(EmptyKey); } @@ -391,8 +411,8 @@ protected: initEmpty(); // Insert all the old elements. - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (BucketT &B : OldBuckets) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { @@ -412,9 +432,16 @@ protected: } } - template <typename OtherBaseT> - void copyFrom( - const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) { + void copyFrom(const DerivedT &other) { + this->destroyAll(); + derived().deallocateBuckets(); + setNumEntries(0); + setNumTombstones(0); + if (!derived().allocateBuckets(other.getNumBuckets())) { + // The bucket list is empty. No work to do. 
+ return; + } + assert(&other != this); assert(getNumBuckets() == other.getNumBuckets()); @@ -429,8 +456,8 @@ protected: memcpy(reinterpret_cast<void *>(Buckets), OtherBuckets, NumBuckets * sizeof(BucketT)); } else { - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (size_t I = 0; I < NumBuckets; ++I) { ::new (&Buckets[I].getFirst()) KeyT(OtherBuckets[I].getFirst()); if (!KeyInfoT::isEqual(Buckets[I].getFirst(), EmptyKey) && @@ -440,19 +467,6 @@ protected: } } - template <typename LookupKeyT> - static unsigned getHashValue(const LookupKeyT &Val) { - return KeyInfoT::getHashValue(Val); - } - - static const KeyT getEmptyKey() { - static_assert(std::is_base_of_v<DenseMapBase, DerivedT>, - "Must pass the derived type to this template!"); - return KeyInfoT::getEmptyKey(); - } - - static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } - private: DerivedT &derived() { return *static_cast<DerivedT *>(this); } const DerivedT &derived() const { @@ -526,8 +540,6 @@ private: void grow(unsigned AtLeast) { derived().grow(AtLeast); } - void shrink_and_clear() { derived().shrink_and_clear(); } - template <typename LookupKeyT> BucketT *findBucketForInsertion(const LookupKeyT &Lookup, BucketT *TheBucket) { @@ -560,7 +572,7 @@ private: incrementNumEntries(); // If we are writing over a tombstone, remember this. - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey)) decrementNumTombstones(); @@ -574,8 +586,8 @@ private: if (NumBuckets == 0) return nullptr; - const KeyT EmptyKey = getEmptyKey(); - unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1); unsigned ProbeAmt = 1; while (true) { const BucketT *Bucket = BucketsPtr + BucketNo; @@ -612,13 +624,13 @@ private: // FoundTombstone - Keep track of whether we find a tombstone while probing. 
BucketT *FoundTombstone = nullptr; - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); assert(!KeyInfoT::isEqual(Val, EmptyKey) && !KeyInfoT::isEqual(Val, TombstoneKey) && "Empty/Tombstone value shouldn't be inserted into map!"); - unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); + unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1); unsigned ProbeAmt = 1; while (true) { BucketT *ThisBucket = BucketsPtr + BucketNo; @@ -719,7 +731,7 @@ public: DenseMap(const DenseMap &other) : BaseT() { init(0); - copyFrom(other); + this->copyFrom(other); } DenseMap(DenseMap &&other) : BaseT() { @@ -755,7 +767,7 @@ public: DenseMap &operator=(const DenseMap &other) { if (&other != this) - copyFrom(other); + this->copyFrom(other); return *this; } @@ -767,56 +779,6 @@ public: return *this; } - void copyFrom(const DenseMap &other) { - this->destroyAll(); - deallocateBuckets(); - if (allocateBuckets(other.NumBuckets)) { - this->BaseT::copyFrom(other); - } else { - NumEntries = 0; - NumTombstones = 0; - } - } - - void grow(unsigned AtLeast) { - unsigned OldNumBuckets = NumBuckets; - BucketT *OldBuckets = Buckets; - - allocateBuckets(std::max<unsigned>( - 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1)))); - assert(Buckets); - if (!OldBuckets) { - this->BaseT::initEmpty(); - return; - } - - this->moveFromOldBuckets( - llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets)); - - // Free the old table. - deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, - alignof(BucketT)); - } - - void shrink_and_clear() { - unsigned OldNumBuckets = NumBuckets; - unsigned OldNumEntries = NumEntries; - this->destroyAll(); - - // Reduce the number of buckets. - unsigned NewNumBuckets = 0; - if (OldNumEntries) - NewNumBuckets = std::max(64, 1 << (Log2_32_Ceil(OldNumEntries) + 1)); - if (NewNumBuckets == NumBuckets) { - this->BaseT::initEmpty(); - return; - } - - deallocate_buffer(Buckets, sizeof(BucketT) * OldNumBuckets, - alignof(BucketT)); - init(NewNumBuckets); - } - private: unsigned getNumEntries() const { return NumEntries; } @@ -855,6 +817,38 @@ private: NumTombstones = 0; } } + + void grow(unsigned AtLeast) { + unsigned OldNumBuckets = NumBuckets; + BucketT *OldBuckets = Buckets; + + allocateBuckets(std::max<unsigned>( + 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1)))); + assert(Buckets); + if (!OldBuckets) { + this->BaseT::initEmpty(); + return; + } + + this->moveFromOldBuckets( + llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets)); + + // Free the old table. + deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, + alignof(BucketT)); + } + + // Plan how to shrink the bucket table. Return: + // - {false, 0} to reuse the existing bucket table + // - {true, N} to reallocate a bucket table with N entries + std::pair<bool, unsigned> planShrinkAndClear() const { + unsigned NewNumBuckets = 0; + if (NumEntries) + NewNumBuckets = std::max(64u, 1u << (Log2_32_Ceil(NumEntries) + 1)); + if (NewNumBuckets == NumBuckets) + return {false, 0}; // Reuse. + return {true, NewNumBuckets}; // Reallocate. 
+ } }; template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4, @@ -896,7 +890,7 @@ public: SmallDenseMap(const SmallDenseMap &other) : BaseT() { init(0); - copyFrom(other); + this->copyFrom(other); } SmallDenseMap(SmallDenseMap &&other) : BaseT() { @@ -928,8 +922,8 @@ public: NumEntries = TmpNumEntries; std::swap(NumTombstones, RHS.NumTombstones); - const KeyT EmptyKey = this->getEmptyKey(); - const KeyT TombstoneKey = this->getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); if (Small && RHS.Small) { // If we're swapping inline bucket arrays, we have to cope with some of // the tricky bits of DenseMap's storage system: the buckets are not @@ -995,7 +989,7 @@ public: SmallDenseMap &operator=(const SmallDenseMap &other) { if (&other != this) - copyFrom(other); + this->copyFrom(other); return *this; } @@ -1007,86 +1001,6 @@ public: return *this; } - void copyFrom(const SmallDenseMap &other) { - this->destroyAll(); - deallocateBuckets(); - allocateBuckets(other.getNumBuckets()); - this->BaseT::copyFrom(other); - } - - void init(unsigned InitNumEntries) { - auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); - allocateBuckets(InitBuckets); - this->BaseT::initEmpty(); - } - - void grow(unsigned AtLeast) { - if (AtLeast > InlineBuckets) - AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1)); - - if (Small) { - // First move the inline buckets into a temporary storage. - AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage; - BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage); - BucketT *TmpEnd = TmpBegin; - - // Loop over the buckets, moving non-empty, non-tombstones into the - // temporary storage. Have the loop move the TmpEnd forward as it goes. - const KeyT EmptyKey = this->getEmptyKey(); - const KeyT TombstoneKey = this->getTombstoneKey(); - for (BucketT &B : inlineBuckets()) { - if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && - !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { - assert(size_t(TmpEnd - TmpBegin) < InlineBuckets && - "Too many inline buckets!"); - ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst())); - ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond())); - ++TmpEnd; - B.getSecond().~ValueT(); - } - B.getFirst().~KeyT(); - } - - // AtLeast == InlineBuckets can happen if there are many tombstones, - // and grow() is used to remove them. Usually we always switch to the - // large rep here. - allocateBuckets(AtLeast); - this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); - return; - } - - LargeRep OldRep = std::move(*getLargeRep()); - getLargeRep()->~LargeRep(); - allocateBuckets(AtLeast); - - this->moveFromOldBuckets(OldRep.buckets()); - - // Free the old table. - deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, - alignof(BucketT)); - } - - void shrink_and_clear() { - unsigned OldSize = this->size(); - this->destroyAll(); - - // Reduce the number of buckets. 
- unsigned NewNumBuckets = 0; - if (OldSize) { - NewNumBuckets = 1 << (Log2_32_Ceil(OldSize) + 1); - if (NewNumBuckets > InlineBuckets && NewNumBuckets < 64u) - NewNumBuckets = 64; - } - if ((Small && NewNumBuckets <= InlineBuckets) || - (!Small && NewNumBuckets == getLargeRep()->NumBuckets)) { - this->BaseT::initEmpty(); - return; - } - - deallocateBuckets(); - init(NewNumBuckets); - } - private: unsigned getNumEntries() const { return NumEntries; } @@ -1152,7 +1066,7 @@ private: getLargeRep()->~LargeRep(); } - void allocateBuckets(unsigned Num) { + bool allocateBuckets(unsigned Num) { if (Num <= InlineBuckets) { Small = true; } else { @@ -1161,6 +1075,76 @@ private: allocate_buffer(sizeof(BucketT) * Num, alignof(BucketT))); new (getLargeRep()) LargeRep{NewBuckets, Num}; } + return true; + } + + void init(unsigned InitNumEntries) { + auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); + allocateBuckets(InitBuckets); + this->BaseT::initEmpty(); + } + + void grow(unsigned AtLeast) { + if (AtLeast > InlineBuckets) + AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1)); + + if (Small) { + // First move the inline buckets into a temporary storage. + AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage; + BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage); + BucketT *TmpEnd = TmpBegin; + + // Loop over the buckets, moving non-empty, non-tombstones into the + // temporary storage. Have the loop move the TmpEnd forward as it goes. + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); + for (BucketT &B : inlineBuckets()) { + if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && + !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { + assert(size_t(TmpEnd - TmpBegin) < InlineBuckets && + "Too many inline buckets!"); + ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst())); + ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond())); + ++TmpEnd; + B.getSecond().~ValueT(); + } + B.getFirst().~KeyT(); + } + + // AtLeast == InlineBuckets can happen if there are many tombstones, + // and grow() is used to remove them. Usually we always switch to the + // large rep here. + allocateBuckets(AtLeast); + this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); + return; + } + + LargeRep OldRep = std::move(*getLargeRep()); + getLargeRep()->~LargeRep(); + allocateBuckets(AtLeast); + + this->moveFromOldBuckets(OldRep.buckets()); + + // Free the old table. + deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, + alignof(BucketT)); + } + + // Plan how to shrink the bucket table. Return: + // - {false, 0} to reuse the existing bucket table + // - {true, N} to reallocate a bucket table with N entries + std::pair<bool, unsigned> planShrinkAndClear() const { + unsigned NewNumBuckets = 0; + if (!this->empty()) { + NewNumBuckets = 1u << (Log2_32_Ceil(this->size()) + 1); + if (NewNumBuckets > InlineBuckets) + NewNumBuckets = std::max(64u, NewNumBuckets); + } + bool Reuse = Small ? NewNumBuckets <= InlineBuckets + : NewNumBuckets == getLargeRep()->NumBuckets; + if (Reuse) + return {false, 0}; // Reuse. + return {true, NewNumBuckets}; // Reallocate. 
} }; diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h index b51c586..6502a70 100644 --- a/llvm/include/llvm/Support/Recycler.h +++ b/llvm/include/llvm/Support/Recycler.h @@ -19,6 +19,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> +#include <type_traits> namespace llvm { @@ -72,19 +73,19 @@ public: /// deleted; calling clear is one way to ensure this. template<class AllocatorType> void clear(AllocatorType &Allocator) { - while (FreeList) { - T *t = reinterpret_cast<T *>(pop_val()); - Allocator.Deallocate(t, Size, Align); + if constexpr (std::is_same_v<std::decay_t<AllocatorType>, + BumpPtrAllocator>) { + // For BumpPtrAllocator, Deallocate is a no-op, so just drop the free + // list. + FreeList = nullptr; + } else { + while (FreeList) { + T *t = reinterpret_cast<T *>(pop_val()); + Allocator.Deallocate(t, Size, Align); + } } } - /// Special case for BumpPtrAllocator which has an empty Deallocate() - /// function. - /// - /// There is no need to traverse the free list, pulling all the objects into - /// cache. - void clear(BumpPtrAllocator &) { FreeList = nullptr; } - template<class SubClass, class AllocatorType> SubClass *Allocate(AllocatorType &Allocator) { static_assert(alignof(SubClass) <= Align, diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index a235975..860f73c 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -167,8 +167,9 @@ private: std::vector<GlobMatcher::Glob> Globs; RadixTree<iterator_range<StringRef::const_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>> - PrefixToGlob; + RadixTree<iterator_range<StringRef::const_reverse_iterator>, + SmallVector<const GlobMatcher::Glob *, 1>>> + PrefixSuffixToGlob; }; /// Represents a set of patterns and their line numbers diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 178529f..52c43a4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -9084,6 +9084,8 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { SmallVector<Register, 32> BuildVec; LLT EltTy = DstTy.getScalarType(); + DenseMap<unsigned, Register> CachedExtract; + for (int Idx : Mask) { if (Idx < 0) { if (!Undef.isValid()) @@ -9097,9 +9099,13 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { int NumElts = Src0Ty.getNumElements(); Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; - auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); - auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); - BuildVec.push_back(Extract.getReg(0)); + auto [It, Inserted] = CachedExtract.try_emplace(Idx); + if (Inserted) { + auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); + It->second = + MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK).getReg(0); + } + BuildVec.push_back(It->second); } assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR"); diff --git a/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/llvm/lib/CodeGen/RegisterUsageInfo.cpp index 7a4628a..2ef380f 100644 --- a/llvm/lib/CodeGen/RegisterUsageInfo.cpp +++ b/llvm/lib/CodeGen/RegisterUsageInfo.cpp @@ -44,7 +44,7 @@ void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) { } bool PhysicalRegisterUsageInfo::doInitialization(Module &M) { - RegMasks.grow(M.size()); + RegMasks.reserve(M.size()); return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index dee0909..a522650 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2015,9 +2015,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Register InReg = FuncInfo.InitializeRegForValue(Inst); std::optional<CallingConv::ID> CallConv; - auto *CI = dyn_cast<CallInst>(Inst); - if (CI && !CI->isInlineAsm()) - CallConv = CI->getCallingConv(); + auto *CB = dyn_cast<CallBase>(Inst); + if (CB && !CB->isInlineAsm()) + CallConv = CB->getCallingConv(); RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, Inst->getType(), CallConv); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff9..da4e409 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9899,6 +9899,18 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { // Use a rotate by 8. This can be further expanded if necessary. return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); case MVT::i32: + // This is meant for ARM specifically, which has ROTR but no ROTL. 
+ if (isOperationLegalOrCustom(ISD::ROTR, VT)) { + SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT); + // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF + SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask); + SDValue Rotr = + DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT)); + SDValue Rotl = + DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); + SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask); + return DAG.getNode(ISD::OR, dl, VT, Rotr, And2); + } Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op, DAG.getConstant(0xFF00, dl, VT)); diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 049efe8..c33bf8b 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -44,7 +44,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { return false; return StringSwitch<bool>(FPAttr.getValueAsString()) - .Cases("all", "non-leaf", "reserved", true) + .Cases({"all", "non-leaf", "reserved"}, true) .Case("none", false); } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index d4901d9..8a8f111 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -5844,11 +5844,11 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, bool MasmParser::lookUpType(StringRef Name, AsmTypeInfo &Info) const { unsigned Size = StringSwitch<unsigned>(Name) - .CasesLower("byte", "db", "sbyte", 1) - .CasesLower("word", "dw", "sword", 2) - .CasesLower("dword", "dd", "sdword", 4) - .CasesLower("fword", "df", 6) - .CasesLower("qword", "dq", "sqword", 8) + .CasesLower({"byte", "db", "sbyte"}, 1) + .CasesLower({"word", "dw", "sword"}, 2) + .CasesLower({"dword", "dd", "sdword"}, 4) + .CasesLower({"fword", "df"}, 6) + .CasesLower({"qword", "dq", "sqword"}, 8) .CaseLower("real4", 4) .CaseLower("real8", 8) .CaseLower("real10", 10) diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index c27f627..3a97185 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -92,8 +92,10 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { for (const auto &G : reverse(Globs)) { StringRef Prefix = G.Pattern.prefix(); + StringRef Suffix = G.Pattern.suffix(); - auto &V = PrefixToGlob.emplace(Prefix).first->second; + auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; + auto &V = SToGlob.emplace(reverse(Suffix)).first->second; V.emplace_back(&G); } } @@ -101,16 +103,18 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { void SpecialCaseList::GlobMatcher::match( StringRef Query, llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { - if (!PrefixToGlob.empty()) { - for (const auto &[_, V] : PrefixToGlob.find_prefixes(Query)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); - // As soon as we find a match in the vector, we can break for this - // vector, since the globs are already sorted by priority within the - // prefix group. However, we continue searching other prefix groups in - // the map, as they may contain a better match overall. 
- break; + if (!PrefixSuffixToGlob.empty()) { + for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { + for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { + for (const auto *G : V) { + if (G->Pattern.match(Query)) { + Cb(G->Name, G->LineNo); + // As soon as we find a match in the vector, we can break for this + // vector, since the globs are already sorted by priority within the + // prefix group. However, we continue searching other prefix groups + // in the map, as they may contain a better match overall. + break; + } } } } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index d1832f4..f680a5e 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -651,7 +651,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~AArch64FunctionInfo() = default; + ~AArch64FunctionInfo() override = default; }; template <> struct MappingTraits<AArch64FunctionInfo> { diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 72a7676..47d76f3 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -154,7 +154,7 @@ public: bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override; - virtual bool isIgnoredCVReg(MCRegister LLVMReg) const override; + bool isIgnoredCVReg(MCRegister LLVMReg) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 1b4b113..6bad4dbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -131,7 +131,7 @@ protected: public: MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() override = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -154,7 +154,7 @@ protected: public: MetadataStreamerMsgPackV5() = default; - ~MetadataStreamerMsgPackV5() = default; + ~MetadataStreamerMsgPackV5() override = default; }; class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { @@ -163,7 +163,7 @@ protected: public: MetadataStreamerMsgPackV6() = default; - ~MetadataStreamerMsgPackV6() = default; + ~MetadataStreamerMsgPackV6() override = default; void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index c5c9473..0804133 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -26,20 +26,19 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: AMDGPUMIRFormatter() = default; - virtual ~AMDGPUMIRFormatter() = default; + ~AMDGPUMIRFormatter() override = default; /// Implement target specific printing for machine operand immediate value, so /// that we can have more meaningful mnemonic than a 64-bit integer. Passing /// None to OpIdx means the index is unknown. 
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - std::optional<unsigned> OpIdx, - int64_t Imm) const override; + void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, int64_t Imm) const override; /// Implement target specific parsing of immediate mnemonics. The mnemonic is /// a string with a leading dot. - virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const override; + bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; /// Implement target specific parsing of target custom pseudo source value. bool diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index cbc7427..4d0c163 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -32,7 +32,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() = default; + ~AMDGPUInstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; @@ -88,7 +88,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() = default; + ~AMDGPUCustomBehaviour() override = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 54fcd2a..246a3f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -64,7 +64,7 @@ private: ArrayRef<const MCExpr *> Args; AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - ~AMDGPUMCExpr(); + ~AMDGPUMCExpr() override; bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const; bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5fdedda..dc23a21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1640,7 +1640,7 @@ public: unsigned *PredCost = nullptr) const override; InstructionUniformity - getInstructionUniformity(const MachineInstr &MI) const override final; + getInstructionUniformity(const MachineInstr &MI) const final; InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 2c1a13c..019c3b7 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { const llvm::MachineFunction &MF); void mappingImpl(yaml::IO &YamlIO) override; - ~SIMachineFunctionInfo() = default; + ~SIMachineFunctionInfo() override = default; }; template <> struct MappingTraits<SIMachineFunctionInfo> { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 07264d9..a177a42 100644 --- 
a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -640,7 +640,7 @@ public: bool finalizeStore(MachineInstr &MI, bool Atomic) const override; - virtual bool handleCooperativeAtomic(MachineInstr &MI) const override; + bool handleCooperativeAtomic(MachineInstr &MI) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index a9c4b53..72eb3d0 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -312,7 +312,7 @@ struct ARMFunctionInfo final : public yaml::MachineFunctionInfo { ARMFunctionInfo(const llvm::ARMFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~ARMFunctionInfo() = default; + ~ARMFunctionInfo() override = default; }; template <> struct MappingTraits<ARMFunctionInfo> { diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 3a840a3..5548ad1 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -40,7 +40,7 @@ class AVRDisassembler : public MCDisassembler { public: AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~AVRDisassembler() = default; + ~AVRDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 619efb3..03c60e8 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -24,7 +24,7 @@ class AVRELFObjectWriter : public MCELFObjectTargetWriter { public: AVRELFObjectWriter(uint8_t OSABI); - virtual ~AVRELFObjectWriter() = default; + ~AVRELFObjectWriter() override = default; unsigned getRelocType(const MCFixup &, const MCValue &, bool IsPCRel) const override; diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 0cfb283..90ef207 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -32,7 +32,7 @@ public: void emitInstruction(const MachineInstr *MI) override; MCSymbol *getJTPublicSymbol(unsigned JTI); - virtual void emitJumpTableInfo() override; + void emitJumpTableInfo() override; static char ID; diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp index e3c39a1..b12985d 100644 --- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp +++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp @@ -46,7 +46,7 @@ class BPFCheckAndAdjustIR final : public ModulePass { public: static char ID; BPFCheckAndAdjustIR() : ModulePass(ID) {} - virtual void getAnalysisUsage(AnalysisUsage &AU) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; private: void checkIR(Module &M); diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h index f3064c0c..af3542e 100644 --- a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h +++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { class BPFTargetLoweringObjectFileELF : public TargetLoweringObjectFileELF { public: - virtual MCSection * 
+ MCSection * getSectionForJumpTable(const Function &F, const TargetMachine &TM, const MachineJumpTableEntry *JTE) const override; }; diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 4aecaf1..8e791e6 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -635,7 +635,7 @@ public: B.setChangeObserver(*this); } - ~InstManager() { B.stopObservingChanges(); } + ~InstManager() override { B.stopObservingChanges(); } void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); } void erasingInstr(MachineInstr &MI) override {} diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h index 34efa0b..4ccd3cf 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h @@ -33,7 +33,7 @@ public: explicit RISCVLMULInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {} - ~RISCVLMULInstrument() = default; + ~RISCVLMULInstrument() override = default; uint8_t getLMUL() const; }; @@ -45,7 +45,7 @@ public: explicit RISCVSEWInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {} - ~RISCVSEWInstrument() = default; + ~RISCVSEWInstrument() override = default; uint8_t getSEW() const; }; diff --git a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h index b69904d..b2ce57a 100644 --- a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h +++ b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h @@ -37,7 +37,7 @@ private: RISCVCPKind Kind; public: - ~RISCVConstantPoolValue() = default; + ~RISCVConstantPoolValue() override = default; static RISCVConstantPoolValue *Create(const GlobalValue *GV); static RISCVConstantPoolValue *Create(LLVMContext &C, StringRef S); diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 4fa93f1..f9be80f 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -31,7 +31,7 @@ struct RISCVMachineFunctionInfo final : public yaml::MachineFunctionInfo { RISCVMachineFunctionInfo(const llvm::RISCVMachineFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~RISCVMachineFunctionInfo() = default; + ~RISCVMachineFunctionInfo() override = default; }; template <> struct MappingTraits<RISCVMachineFunctionInfo> { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 28a1690..6e444c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -347,7 +347,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { public: SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(F); GR = ST.getSPIRVGlobalRegistry(); DeadInstructions.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 60d39c9..aba9cf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -234,7 +234,7 @@ public: } #endif - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { LoopInfo &LI = 
getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *TopLevelRegion = getAnalysis<SPIRVConvergenceRegionAnalysisWrapperPass>() diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index e621bcd44..b1a8d1a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -28,7 +28,7 @@ public: SPIRVStripConvergentIntrinsics() : FunctionPass(ID) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { DenseSet<Instruction *> ToRemove; // Is the instruction is a convergent intrinsic, add it to kill-list and diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 5b149f8..ea634fb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -1113,7 +1113,7 @@ public: SPIRVStructurizer() : FunctionPass(ID) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { bool Modified = false; // In LLVM, Switches are allowed to have several cases branching to the same diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index e585e5a..b4dadaa 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -36,7 +36,7 @@ class SparcDisassembler : public MCDisassembler { public: SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~SparcDisassembler() = default; + ~SparcDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h index 1ff6cc8..ba325b5 100644 --- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -111,7 +111,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { public: SystemZPostRASchedStrategy(const MachineSchedContext *C); - virtual ~SystemZPostRASchedStrategy(); + ~SystemZPostRASchedStrategy() override; /// Called for a region before scheduling. void initPolicy(MachineBasicBlock::iterator Begin, diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h index b1de145..bea8b9f 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -101,7 +101,7 @@ public: int getStackPointerBias() final { return 2048; } /// Destroys the object. Bogus destructor overriding base class destructor - ~SystemZXPLINK64Registers() = default; + ~SystemZXPLINK64Registers() override = default; }; /// ELF calling convention specific use registers @@ -124,7 +124,7 @@ public: int getStackPointerBias() final { return 0; } /// Destroys the object. 
Bogus destructor overriding base class destructor - ~SystemZELFRegisters() = default; + ~SystemZELFRegisters() override = default; }; struct SystemZRegisterInfo : public SystemZGenRegisterInfo { diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp index aad826b..465e074 100644 --- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp +++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp @@ -36,7 +36,7 @@ class VEDisassembler : public MCDisassembler { public: VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~VEDisassembler() = default; + ~VEDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 45bbf12..9175b27 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -102,7 +102,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { WebAssemblyOperand(SMLoc Start, SMLoc End, CaLOp C) : Kind(CatchList), StartLoc(Start), EndLoc(End), CaL(C) {} - ~WebAssemblyOperand() { + ~WebAssemblyOperand() override { if (isBrList()) BrL.~BrLOp(); if (isCatchList()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 40ae4ae..ff4d6469 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -192,7 +192,7 @@ struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo { const llvm::WebAssemblyFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~WebAssemblyFunctionInfo() = default; + ~WebAssemblyFunctionInfo() override = default; }; template <> struct MappingTraits<WebAssemblyFunctionInfo> { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index ac251fd..127ee67 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3533,10 +3533,10 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, while (isLockRepeatNtPrefix(Name.lower())) { unsigned Prefix = StringSwitch<unsigned>(Name) - .Cases("lock", "lock", X86::IP_HAS_LOCK) - .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT) - .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE) - .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK) + .Case("lock", X86::IP_HAS_LOCK) + .Cases({"rep", "repe", "repz"}, X86::IP_HAS_REPEAT) + .Cases({"repne", "repnz"}, X86::IP_HAS_REPEAT_NE) + .Case("notrack", X86::IP_HAS_NOTRACK) .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible) Flags |= Prefix; if (getLexer().is(AsmToken::EndOfStatement)) { diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index d6197f3..05a1c22 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -37,7 +37,7 @@ public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~X86InstrPostProcess() = default; + ~X86InstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; diff --git 
a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 1c06dc4..af5a698 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -491,7 +491,7 @@ namespace X86_MC { class X86MCInstrAnalysis : public MCInstrAnalysis { X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete; X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete; - virtual ~X86MCInstrAnalysis() = default; + ~X86MCInstrAnalysis() override = default; public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 5f974e5..1bda505 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -43,7 +43,7 @@ struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo { X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~X86MachineFunctionInfo() = default; + ~X86MachineFunctionInfo() override = default; }; template <> struct MappingTraits<X86MachineFunctionInfo> { diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 077d29f..3b59ebb 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -272,6 +272,9 @@ AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA, } if (RangePtr && !RangePtr->offsetOrSizeAreUnknown()) { + int64_t StorageSize = DL.getTypeStoreSize(&Ty); + if (StorageSize != RangePtr->Size) + return nullptr; APInt Offset = APInt(64, RangePtr->Offset); return ConstantFoldLoadFromConst(Initializer, &Ty, Offset, DL); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir index 423ce82..d5947c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir @@ -90,9 +90,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -328,3 +326,30 @@ body: | $vgpr0_vgpr1 = COPY %4 ... 
+ +--- +name: shufflevector_cache_element +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: shufflevector_cache_element + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<8 x s32>), [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %2, %2, shufflemask(0, 0, 0, 0, 3, 3, 3, 3) + %4:_(s32) = G_EXTRACT_VECTOR_ELT %3(<8 x s32>), %1(s32) + $vgpr0 = COPY %4 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir index 10f425c..dff8111 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -237,16 +237,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_0_0 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -257,10 +254,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -285,12 +279,10 @@ body: | ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - 
; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_1_1 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -301,10 +293,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -329,16 +318,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_2 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -349,10 +335,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -641,12 +624,10 @@ body: | ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_3_3 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -657,10 +638,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -733,16 +711,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_0 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -753,10 +728,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 4b6d14e..1d5c858 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ 
b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -53,14 +53,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -223,21 +221,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -377,14 +370,12 @@ define i64 @load_i64_by_i8(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: @@ -434,14 +425,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: @@ -587,14 +576,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: 
orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: @@ -667,14 +654,12 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: @@ -733,14 +718,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 0f6ec8a..7087367 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -117,14 +117,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -237,21 +235,16 @@ define i64 @load_i64_by_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; 
CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -413,14 +406,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: @@ -469,14 +460,12 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: @@ -527,14 +516,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: diff --git a/llvm/test/Transforms/Attributor/range-and-constant-fold.ll b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll new file mode 100644 index 0000000..a8f3309 --- /dev/null +++ b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=attributor %s -o - | FileCheck %s + +@g = internal unnamed_addr addrspace(4) constant [3 x i8] c"12\00", align 16 + +define void @foo(i32 %a, i32 %b, ptr %p.0, ptr %p.1) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], ptr nofree nonnull writeonly captures(none) dereferenceable(1) [[P_0:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(8) [[P_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[L1:.*]], label %[[L2:.*]] +; CHECK: [[L1]]: +; CHECK-NEXT: br label %[[L3:.*]] +; CHECK: [[L2]]: +; CHECK-NEXT: br label %[[L3]] +; CHECK: [[L3]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ @g, %[[L1]] ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %[[L2]] ] +; CHECK-NEXT: [[LOAD_SMALL:%.*]] = load i8, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i8 [[LOAD_SMALL]], ptr [[P_0]], align 1 +; CHECK-NEXT: 
[[LOAD_LARGE:%.*]] = load i64, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i64 [[LOAD_LARGE]], ptr [[P_1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp ne i32 %a, %b + br i1 %cmp, label %l1, label %l2 +l1: + br label %l3 +l2: + br label %l3 +l3: + %phi = phi ptr addrspace(4) [ @g, %l1 ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %l2 ] + %load.small = load i8, ptr addrspace(4) %phi + store i8 %load.small, ptr %p.0 + %load.large = load i64, ptr addrspace(4) %phi + store i64 %load.large, ptr %p.1 + ret void +} diff --git a/llvm/tools/llvm-cov/llvm-cov.cpp b/llvm/tools/llvm-cov/llvm-cov.cpp index 5ada5578..52610eff 100644 --- a/llvm/tools/llvm-cov/llvm-cov.cpp +++ b/llvm/tools/llvm-cov/llvm-cov.cpp @@ -71,8 +71,8 @@ int main(int argc, const char **argv) { .Case("gcov", gcovMain) .Case("report", reportMain) .Case("show", showMain) - .Cases("-h", "-help", "--help", helpMain) - .Cases("-version", "--version", versionMain) + .Cases({"-h", "-help", "--help"}, helpMain) + .Cases({"-version", "--version"}, versionMain) .Default(nullptr); if (Func) { diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index 50e9c6e..aceb4f3 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -70,6 +70,16 @@ public: int getValue() const { return Value; } bool operator==(const CtorTester &RHS) const { return Value == RHS.Value; } + + // Return the number of live CtorTester objects, excluding the empty and + // tombstone keys. + static size_t getNumConstructed() { + return std::count_if(Constructed.begin(), Constructed.end(), + [](const CtorTester *Obj) { + int V = Obj->getValue(); + return V != -1 && V != -2; + }); + } }; std::set<CtorTester *> CtorTester::Constructed; @@ -1031,4 +1041,64 @@ TEST(SmallDenseMapCustomTest, InitSize) { } } +TEST(DenseMapCustomTest, KeyDtor) { + // This test relies on CtorTester being non-trivially destructible. + static_assert(!std::is_trivially_destructible_v<CtorTester>, + "CtorTester must not be trivially destructible"); + + // Test that keys are destructed on scope exit. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<CtorTester, int, CtorTesterMapInfo> Map; + Map.try_emplace(CtorTester(0), 1); + Map.try_emplace(CtorTester(1), 2); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + + // Test that keys are destructed on erase and shrink_and_clear. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<CtorTester, int, CtorTesterMapInfo> Map; + Map.try_emplace(CtorTester(0), 1); + Map.try_emplace(CtorTester(1), 2); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + Map.erase(CtorTester(1)); + EXPECT_EQ(1u, CtorTester::getNumConstructed()); + Map.shrink_and_clear(); + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); +} + +TEST(DenseMapCustomTest, ValueDtor) { + // This test relies on CtorTester being non-trivially destructible. + static_assert(!std::is_trivially_destructible_v<CtorTester>, + "CtorTester must not be trivially destructible"); + + // Test that values are destructed on scope exit. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<int, CtorTester> Map; + Map.try_emplace(0, CtorTester(1)); + Map.try_emplace(1, CtorTester(2)); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + + // Test that values are destructed on erase and shrink_and_clear. 
+ EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<int, CtorTester> Map; + Map.try_emplace(0, CtorTester(1)); + Map.try_emplace(1, CtorTester(2)); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + Map.erase(1); + EXPECT_EQ(1u, CtorTester::getNumConstructed()); + Map.shrink_and_clear(); + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); +} + } // namespace diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp index d88a0ff..c94feb5 100644 --- a/llvm/unittests/ADT/StringSwitchTest.cpp +++ b/llvm/unittests/ADT/StringSwitchTest.cpp @@ -157,7 +157,7 @@ TEST(StringSwitchTest, Cases) { auto Translate = [](StringRef S) { return llvm::StringSwitch<OSType>(S) - .Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt", + .Cases({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"}, OSType::Windows) .Cases({"linux", "unix", "*nix", "posix"}, OSType::Linux) .Cases({"macos", "osx"}, OSType::MacOS) @@ -189,7 +189,7 @@ TEST(StringSwitchTest, CasesLower) { auto Translate = [](StringRef S) { return llvm::StringSwitch<OSType>(S) - .CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt", + .CasesLower({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"}, OSType::Windows) .CasesLower({"linux", "unix", "*nix", "posix"}, OSType::Linux) .CasesLower({"macos", "osx"}, OSType::MacOS) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index b570f8d..151b065 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -71,6 +71,7 @@ CodeGen/Hexagon/autohvx/interleave.ll CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll CodeGen/Hexagon/loop-idiom/memmove-rt-check.ll +CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll @@ -331,6 +332,10 @@ Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll Instrumentation/MemorySanitizer/AArch64/module-flags-aarch64.ll Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll Instrumentation/MemorySanitizer/AArch64/qshrn.ll +Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll +Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll +Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll +Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll Instrumentation/MemorySanitizer/AArch64/vararg.ll Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll Instrumentation/MemorySanitizer/abs-vector.ll diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 1437c39..37db096 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1111,9 +1111,9 @@ def AMDGPU_ScaledMFMAOp : AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>, Pure]>, Arguments<(ins - I32Attr:$m, - I32Attr:$n, - I32Attr:$k, + ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$m, + ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$n, + ConfinedAttr<I32Attr, [IntIsOneOf<[64, 128]>]>:$k, ScaledMFMAInTypes:$sourceA, ScaledMFMAInTypes:$sourceB, ScaledMFMAOutTypes:$destC, @@ -1126,8 +1126,8 @@ def AMDGPU_ScaledMFMAOp : let summary = "MLIR wrapper for CDNA scaled mfma instructions"; let description = [{ The `amdgpu.scaled_mfma` op is an MLIR wrapper around intrinsics - for various scaled versions of `mfma` instructions in the CDNA architecture, which perform - 
multiple outer products in order to allow fast matrix multiplication. + for various scaled versions of `mfma` instructions in the CDNA architecture, which + perform multiple outer products in order to allow fast matrix multiplication. The wrapper will select an appropriate `mfma` instruction, if one is available, based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the @@ -1142,15 +1142,23 @@ def AMDGPU_ScaledMFMAOp : This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences: - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and - fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile - size. + fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as + their tile size. - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` - are omitted from this wrapper. - - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for - double-precision operations on gfx94x and so are not included here. + are omitted from this wrapper. + - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported + for double-precision operations on gfx94x and so are not included here. + + Example: + ```mlir + %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 + : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + ``` }]; let assemblyFormat = [{ - `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC + custom<MNKDimensionList>($m, $n, $k) ` ` + `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` + `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC attr-dict `:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC) }]; diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir index 6de55d5..c746d76 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir @@ -55,10 +55,10 @@ func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>, // CHECK-LABEL: func @scaled_mfma_to_rocdl( // CHECK-SAME: %[[ARG0:.*]]: vector<16xf32>, %[[ARG1:.*]]: vector<4xf32>, %[[ARG2:.*]]: vector<32xf8E4M3FN>, %[[ARG3:.*]]: vector<32xf8E5M2>, %[[ARG4:.*]]: vector<32xf6E2M3FN>, %[[ARG5:.*]]: vector<32xf6E3M2FN>, %[[ARG6:.*]]: vector<32xf4E2M1FN>, %[[ARG7:.*]]: vector<4xf8E8M0FNU>, %[[ARG8:.*]]: f8E8M0FNU func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>, - %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>, - %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>, - %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>, - %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) { + %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>, + %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>, + %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>, + %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) { // CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32 @@ -66,39 +66,39 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>, // CHECK: %[[z0:.+]] = llvm.zext {{.*}} : i8 to i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - 
amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: llvm.mlir.constant(3 : i32) : i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : 
i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: llvm.mlir.constant(4 : i32) : i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32> func.return } diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir index 52d3275..fee0c00 100644 --- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir +++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir @@ -165,10 +165,10 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds: // CHECK-LABEL: func @scaled_mfma // CHECK: %[[SCALE_1:.*]] = vector.extract_strided_slice %0 {offsets = [0], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> // CHECK: %[[SCALE_2:.*]] = vector.extract_strided_slice %2 {offsets = [4], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma(%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}} // CHECK: %[[SCALE_3:.*]] = vector.extract_strided_slice %5 {offsets = [8], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> // CHECK: %[[SCALE_4:.*]] = vector.extract_strided_slice %7 {offsets = [12], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma(%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}} func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2x1x8x1xf8E8M0FNU>, %scalesB: vector<2x1x8x1xf8E8M0FNU>) -> 
(vector<4xf32>, vector<4xf32>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -176,12 +176,12 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleB = vector.extract %scalesB[0, 0, 6, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> %scaleC = vector.extract %scalesA[1, 0, 1, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sC = vector.insert %scaleC, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleD = vector.extract %scalesB[1, 0, 4, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sD = vector.insert %scaleD, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_1 = amdgpu.scaled_mfma(%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_1 = amdgpu.scaled_mfma 16x16x128 (%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_0, %res_1 : vector<4xf32>, vector<4xf32> } @@ -192,7 +192,7 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU> // CHECK: vector.extract {{.*}} : f8E8M0FNU from vector<2xf8E8M0FNU> // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 ({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}} func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2xf8E8M0FNU>, %scalesB: vector<2xf8E8M0FNU>) -> vector<4xf32> { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -200,17 +200,17 @@ func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4 %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleB = vector.extract %scalesB[1] : f8E8M0FNU from vector<2xf8E8M0FNU> %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_0 : vector<4xf32> } // ----- // CHECK-LABEL: func @scaled_mfma_ugly_shapes -// CHECK: amdgpu.scaled_mfma(%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[1] * %{{.*}}) * 
(%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<5x5xf8E8M0FNU>, %scalesB: vector<7x23xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -237,10 +237,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4 %sB_6_21 = vector.insert %scaleB_6_21, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %sB_6_20 = vector.insert %scaleB_6_20, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %sB_6_19 = vector.insert %scaleB_6_19, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - - %res_4 = amdgpu.scaled_mfma(%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_5 = amdgpu.scaled_mfma(%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_6 = amdgpu.scaled_mfma(%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_7 = amdgpu.scaled_mfma(%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + + %res_4 = amdgpu.scaled_mfma 16x16x128 (%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_5 = amdgpu.scaled_mfma 16x16x128 (%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_6 = amdgpu.scaled_mfma 16x16x128 (%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, 
vector<32xf4E2M1FN>, vector<4xf32> + %res_7 = amdgpu.scaled_mfma 16x16x128 (%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_4, %res_5, %res_6, %res_7 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> } diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index b27dfdf..5784764 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -314,3 +314,27 @@ func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> func.return } + +// ----- + +func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} + +// ----- + +func.func @scaled_mfma_invalid_n(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + %0 = amdgpu.scaled_mfma 32x8x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} + +// ----- + +func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {64, 128}}} + %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index d0bf0d7..a330967 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -609,8 +609,8 @@ func.func @permlane32_swap(%arg0 : f32) -> f32 { // CHECK-LABEL: func @scaled_mfma func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { - // CHECK: amdgpu.scaled_mfma - %0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + // CHECK: amdgpu.scaled_mfma 32x32x64 + %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> func.return %0 : vector<16xf32> } |
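
Note on the StringSwitch changes above (X86AsmParser.cpp, llvm-cov.cpp, StringSwitchTest.cpp): the call sites move from the variadic Cases("a", "b", Value) spelling to the overload that takes a braced list of case strings followed by a single value. A minimal usage sketch, assuming only the initializer-list overloads exercised in those hunks; the classifyPrefix helper below is hypothetical and for illustration only:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  // Map a prefix mnemonic to a flag value; several spellings share one result.
  static unsigned classifyPrefix(llvm::StringRef Name) {
    return llvm::StringSwitch<unsigned>(Name)
        .Case("lock", 1)
        .Cases({"rep", "repe", "repz"}, 2)   // braced list of keys, one value
        .Cases({"repne", "repnz"}, 3)
        .Default(0);
  }

The braced-list form keeps the case strings visually separate from the value argument, which the old variadic form left implicit as "last argument wins".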
