-rw-r--r--  bolt/lib/Passes/BinaryPasses.cpp  5
-rw-r--r--  clang/docs/LanguageExtensions.rst  76
-rw-r--r--  clang/lib/Basic/Targets/AArch64.cpp  12
-rw-r--r--  clang/lib/Basic/Targets/PPC.h  30
-rw-r--r--  clang/lib/Frontend/CompilerInvocation.cpp  2
-rw-r--r--  clang/lib/Lex/LiteralSupport.cpp  8
-rw-r--r--  llvm/docs/CIBestPractices.rst  17
-rw-r--r--  llvm/include/llvm/ADT/Bitfields.h  4
-rw-r--r--  llvm/include/llvm/ADT/ConcurrentHashtable.h  11
-rw-r--r--  llvm/include/llvm/ADT/DenseMap.h  326
-rw-r--r--  llvm/include/llvm/Support/Recycler.h  21
-rw-r--r--  llvm/include/llvm/Support/SpecialCaseList.h  5
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp  12
-rw-r--r--  llvm/lib/CodeGen/RegisterUsageInfo.cpp  2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp  6
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp  12
-rw-r--r--  llvm/lib/CodeGen/TargetOptionsImpl.cpp  2
-rw-r--r--  llvm/lib/MC/MCParser/MasmParser.cpp  10
-rw-r--r--  llvm/lib/Support/SpecialCaseList.cpp  26
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.h  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h  6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h  13
-rw-r--r--  llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h  4
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h  2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h  2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  2
-rw-r--r--  llvm/lib/Target/ARM/ARMMachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp  2
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp  2
-rw-r--r--  llvm/lib/Target/BPF/BPFAsmPrinter.h  2
-rw-r--r--  llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp  2
-rw-r--r--  llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h  2
-rw-r--r--  llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp  2
-rw-r--r--  llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h  4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVConstantPoolValue.h  2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp  2
-rw-r--r--  llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp  2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZMachineScheduler.h  2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZRegisterInfo.h  4
-rw-r--r--  llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp  2
-rw-r--r--  llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp  2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  8
-rw-r--r--  llvm/lib/Target/X86/MCA/X86CustomBehaviour.h  2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp  2
-rw-r--r--  llvm/lib/Target/X86/X86MachineFunctionInfo.h  2
-rw-r--r--  llvm/lib/Transforms/IPO/Attributor.cpp  3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir  31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir  68
-rw-r--r--  llvm/test/CodeGen/ARM/load-combine-big-endian.ll  93
-rw-r--r--  llvm/test/CodeGen/ARM/load-combine.ll  69
-rw-r--r--  llvm/test/Transforms/Attributor/range-and-constant-fold.ll  38
-rw-r--r--  llvm/tools/llvm-cov/llvm-cov.cpp  4
-rw-r--r--  llvm/unittests/ADT/DenseMapTest.cpp  70
-rw-r--r--  llvm/unittests/ADT/StringSwitchTest.cpp  4
-rw-r--r--  llvm/utils/profcheck-xfail.txt  5
-rw-r--r--  mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td  30
-rw-r--r--  mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir  28
-rw-r--r--  mlir/test/Dialect/AMDGPU/canonicalize.mlir  30
-rw-r--r--  mlir/test/Dialect/AMDGPU/invalid.mlir  24
-rw-r--r--  mlir/test/Dialect/AMDGPU/ops.mlir  4
67 files changed, 657 insertions, 524 deletions
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 06e840e..e1a1856 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -346,13 +346,16 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
uint64_t Bytes;
Function.markUnreachableBlocks();
LLVM_DEBUG({
+ bool HasInvalidBB = false;
for (BinaryBasicBlock &BB : Function) {
if (!BB.isValid()) {
+ HasInvalidBB = true;
dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
<< " in function " << Function << "\n";
- Function.dump();
}
}
+ if (HasInvalidBB)
+ Function.dump();
});
BinaryContext::IndependentCodeEmitter Emitter =
BC.createIndependentMCCodeEmitter();
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index bef6e9c1..495f2ab 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -32,7 +32,7 @@ Feature Checking Macros
=======================
Language extensions can be very useful, but only if you know you can depend on
-them. In order to allow fine-grain features checks, we support three builtin
+them. In order to allow fine-grained feature checks, we support three builtin
function-like macros. This allows you to directly test for a feature in your
code without having to resort to something like autoconf or fragile "compiler
version checks".
@@ -137,7 +137,7 @@ feature) or 0 if not. They can be used like this:
.. _langext-has-feature-back-compat:
For backward compatibility, ``__has_feature`` can also be used to test
-for support for non-standardized features, i.e. features not prefixed ``c_``,
+for support for non-standardized features, i.e., features not prefixed ``c_``,
``cxx_`` or ``objc_``.
Another use of ``__has_feature`` is to check for compiler features not related
@@ -159,7 +159,7 @@ of ``cxx_rvalue_references``.
This function-like macro is available in C++20 by default, and is provided as an
extension in earlier language standards. It takes a single argument that is the
-name of a double-square-bracket-style attribute. The argument can either be a
+name of a double-square-bracket-style attribute. The argument can be either a
single identifier or a scoped identifier. If the attribute is supported, a
nonzero value is returned. If the attribute is a standards-based attribute, this
macro returns a nonzero value based on the year and month in which the attribute
@@ -320,7 +320,7 @@ To test for this feature, use ``#if defined(__has_include)``:
.. code-block:: c++
- // To avoid problem with non-clang compilers not having this macro.
+ // To avoid problems with non-clang compilers not having this macro.
#if defined(__has_include)
#if __has_include("myinclude.h")
# include "myinclude.h"
@@ -345,7 +345,7 @@ or 0 otherwise:
# include_next "myinclude.h"
#endif
- // To avoid problem with non-clang compilers not having this macro.
+ // To avoid problems with non-clang compilers not having this macro.
#if defined(__has_include_next)
#if __has_include_next("myinclude.h")
# include_next "myinclude.h"
@@ -361,7 +361,7 @@ absolute path is used in the file argument.
-----------------
This function-like macro takes a string literal that represents a command line
-option for a warning and returns true if that is a valid warning option.
+option for a warning and returns ``true`` if that is a valid warning option.
.. code-block:: c++
@@ -396,7 +396,7 @@ Builtin Macros
file.
``__clang__``
- Defined when compiling with Clang
+ Defined when compiling with Clang.
``__clang_major__``
Defined to the major marketing version number of Clang (e.g., the 2 in
@@ -599,7 +599,7 @@ NEON vector types are created using ``neon_vector_type`` and
GCC vector types are created using the ``vector_size(N)`` attribute. The
argument ``N`` specifies the number of bytes that will be allocated for an
-object of this type. The size has to be multiple of the size of the vector
+object of this type. The size has to be a multiple of the size of the vector
element type. For example:
.. code-block:: c++
@@ -620,7 +620,7 @@ element type. For example:
Boolean Vectors
---------------
-Clang also supports the ext_vector_type attribute with boolean element types in
+Clang also supports the ``ext_vector_type`` attribute with boolean element types in
C and C++. For example:
.. code-block:: c++
@@ -674,7 +674,7 @@ Vector Literals
Vector literals can be used to create vectors from a set of scalars, or
vectors. Either parentheses or braces form can be used. In the parentheses
-form the number of literal values specified must be one, i.e. referring to a
+form the number of literal values specified must be one, i.e., referring to a
scalar value, or must match the size of the vector type being created. If a
single scalar literal value is specified, the scalar literal value will be
replicated to all the components of the vector type. In the brackets form any
@@ -725,12 +725,12 @@ address &v[i] no no no [#]_ no no
See also :ref:`langext-__builtin_shufflevector`, :ref:`langext-__builtin_convertvector`.
-.. [#] ternary operator(?:) has different behaviors depending on condition
- operand's vector type. If the condition is a GNU vector (i.e. __vector_size__),
+.. [#] ternary operator(?:) has different behaviors depending on the condition
+ operand's vector type. If the condition is a GNU vector (i.e., ``__vector_size__``),
a NEON vector or an SVE vector, it's only available in C++ and uses normal bool
conversions (that is, != 0).
If it's an extension (OpenCL) vector, it's only available in C and OpenCL C.
- And it selects based on signedness of the condition operands (OpenCL v1.1 s6.3.9).
+ And it selects based on the signedness of the condition operands (OpenCL v1.1 s6.3.9).
.. [#] sizeof can only be used on vector length specific SVE types.
.. [#] Clang does not allow the address of an element to be taken while GCC
allows this. This is intentional for vectors with a boolean element type and
@@ -747,7 +747,7 @@ to perform additional operations on certain scalar and vector types.
Let ``T`` be one of the following types:
* an integer type (as in C23 6.2.5p22), but excluding enumerated types and ``bool``
-* the standard floating types float or double
+* the standard floating types ``float`` or ``double``
* a half-precision floating point type, if one is supported on the target
* a vector type.
@@ -833,7 +833,7 @@ of different sizes and signs is forbidden in binary and ternary builtins.
T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types
of a floating-point number
T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types
- T __builtin_elementwise_fmod(T x, T y) return The floating-point remainder of (x/y) whose sign floating point types
+ T __builtin_elementwise_fmod(T x, T y) return the floating-point remainder of (x/y) whose sign floating point types
matches the sign of x.
T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types
For floating point types, follows semantics of maxNum
@@ -948,7 +948,7 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
Each builtin accesses memory according to a provided boolean mask. These are
provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first
-argument is always boolean mask vector. The ``__builtin_masked_load`` builtin
+argument is always a boolean mask vector. The ``__builtin_masked_load`` builtin
takes an optional third vector argument that will be used for the result of the
masked-off lanes. These builtins assume the memory is unaligned, use
``__builtin_assume_aligned`` if alignment is desired.
@@ -1109,7 +1109,7 @@ to ``float``; see below for more information on this emulation.
``__fp16`` and ``_Float16`` both use the binary16 format from IEEE
754-2008, which provides a 5-bit exponent and an 11-bit significand
-(counting the implicit leading 1). ``__bf16`` uses the `bfloat16
+(including the implicit leading 1). ``__bf16`` uses the `bfloat16
<https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ format,
which provides an 8-bit exponent and an 8-bit significand; this is the same
exponent range as `float`, just with greatly reduced precision.
@@ -1143,7 +1143,7 @@ types with the ``-ffloat16-excess-precision=`` and
* ``none``: meaning to perform strict operation-by-operation emulation
* ``standard``: meaning that excess precision is permitted under the rules
- described in the standard, i.e. never across explicit casts or statements
+ described in the standard, i.e., never across explicit casts or statements
* ``fast``: meaning that excess precision is permitted whenever the
optimizer sees an opportunity to avoid truncations; currently this has no
effect beyond ``standard``
@@ -1899,7 +1899,7 @@ Type Trait Primitives
Type trait primitives are special builtin constant expressions that can be used
by the standard C++ library to facilitate or simplify the implementation of
-user-facing type traits in the <type_traits> header.
+user-facing type traits in the ``<type_traits>`` header.
They are not intended to be used directly by user code because they are
implementation-defined and subject to change -- as such they're tied closely to
@@ -2054,7 +2054,7 @@ The following type trait primitives are supported by Clang. Those traits marked
* ``__is_volatile`` (C++, Embarcadero)
* ``__reference_binds_to_temporary(T, U)`` (Clang): Determines whether a
reference of type ``T`` bound to an expression of type ``U`` would bind to a
- materialized temporary object. If ``T`` is not a reference type the result
+ materialized temporary object. If ``T`` is not a reference type, the result
is false. Note this trait will also return false when the initialization of
``T`` from ``U`` is ill-formed.
Deprecated, use ``__reference_constructs_from_temporary``.
@@ -2182,7 +2182,7 @@ Prior to clang-16, the output may only be used safely when the indirect
branches are not taken. Query for this difference with
``__has_extension(gnu_asm_goto_with_outputs_full)``.
-When using tied-outputs (i.e. outputs that are inputs and outputs, not just
+When using tied-outputs (i.e., outputs that are inputs and outputs, not just
outputs) with the `+r` constraint, there is a hidden input that's created
before the label, so numeric references to operands must account for that.
@@ -2361,7 +2361,7 @@ Loading from a ``__weak`` variable always implicitly retains the
loaded value. In non-ARC modes, this retain is normally balanced
by an implicit autorelease. This autorelease can be suppressed
by performing the load in the receiver position of a ``-retain``
-message send (e.g. ``[weakReference retain]``); note that this performs
+message send (e.g., ``[weakReference retain]``); note that this performs
only a single retain (the retain done when primitively loading from
the weak reference).
@@ -2562,7 +2562,7 @@ When a method that's introduced in the OS newer than the target OS is called, a
.. code-block:: objc
void my_fun(NSSomeClass* var) {
- // If fancyNewMethod was added in e.g. macOS 10.12, but the code is
+ // If fancyNewMethod was added in e.g., macOS 10.12, but the code is
// built with -mmacos-version-min=10.11, then this unconditional call
// will emit a -Wunguarded-availability warning:
[var fancyNewMethod];
@@ -2725,7 +2725,7 @@ correctly in any circumstances. It can be used if:
metaprogramming algorithms to be able to specify/detect types generically.
- the generated kernel binary does not contain indirect calls because they
- are eliminated using compiler optimizations e.g. devirtualization.
+ are eliminated using compiler optimizations e.g., devirtualization.
- the selected target supports the function pointer like functionality e.g.
most CPU targets.
@@ -2755,12 +2755,12 @@ Extension Specification, section 1.2
<https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#extensions-overview>`_.
This is not conformant behavior and it can only be used portably when the
-functions with variadic prototypes do not get generated in binary e.g. the
+functions with variadic prototypes do not get generated in binary e.g., the
variadic prototype is used to specify a function type with any number of
arguments in metaprogramming algorithms in C++ for OpenCL.
This extension can also be used when the kernel code is intended for targets
-supporting the variadic arguments e.g. majority of CPU targets.
+supporting the variadic arguments e.g., majority of CPU targets.
**Example of Use**:
@@ -5170,8 +5170,8 @@ __builtin_amdgcn_fence
``__builtin_amdgcn_fence`` emits a fence.
-* ``unsigned`` atomic ordering, e.g. ``__ATOMIC_ACQUIRE``
-* ``const char *`` synchronization scope, e.g. ``workgroup``
+* ``unsigned`` atomic ordering, e.g., ``__ATOMIC_ACQUIRE``
+* ``const char *`` synchronization scope, e.g., ``workgroup``
* Zero or more ``const char *`` address spaces names.
The address spaces arguments must be one of the following string literals:
@@ -5205,7 +5205,7 @@ boolean argument as a bit for every lane of the current wave that is currently
active (i.e., that is converged with the executing thread), and a 0 bit for
every lane that is not active.
-The result is uniform, i.e. it is the same in every active thread of the wave.
+The result is uniform, i.e., it is the same in every active thread of the wave.
__builtin_amdgcn_inverse_ballot_w{32,64}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -5374,7 +5374,7 @@ followed by ``off`` or ``on``.
All function definitions in the region between an ``off`` and the following
``on`` will be decorated with the ``optnone`` attribute unless doing so would
-conflict with explicit attributes already present on the function (e.g. the
+conflict with explicit attributes already present on the function (e.g., the
ones that control inlining).
.. code-block:: c++
@@ -5472,12 +5472,12 @@ provides options for vectorization, interleaving, predication, unrolling and
distribution. Loop hints can be specified before any loop and will be ignored if
the optimization is not safe to apply.
-There are loop hints that control transformations (e.g. vectorization, loop
+There are loop hints that control transformations (e.g., vectorization, loop
unrolling) and there are loop hints that set transformation options (e.g.
``vectorize_width``, ``unroll_count``). Pragmas setting transformation options
-imply the transformation is enabled, as if it was enabled via the corresponding
-transformation pragma (e.g. ``vectorize(enable)``). If the transformation is
-disabled (e.g. ``vectorize(disable)``), that takes precedence over
+imply the transformation is enabled, as if it were enabled via the corresponding
+transformation pragma (e.g., ``vectorize(enable)``). If the transformation is
+disabled (e.g., ``vectorize(disable)``), that takes precedence over
transformations option pragmas implying that transformation.
Vectorization, Interleaving, and Predication
@@ -5704,7 +5704,7 @@ operation when supported by the target.
The pragma can take three values: ``on``, ``fast`` and ``off``. The ``on``
option is identical to using ``#pragma STDC FP_CONTRACT(ON)`` and it allows
fusion as specified the language standard. The ``fast`` option allows fusion
-in cases when the language standard does not make this possible (e.g. across
+in cases when the language standard does not make this possible (e.g., across
statements in C).
.. code-block:: c++
@@ -5859,14 +5859,14 @@ and ``ignore_denormal_mode``, each optionally prefixed with ``no_``. The meaning
are as follows:
- ``remote_memory`` means atomic operations may be performed on remote
- memory, i.e. memory accessed through off-chip interconnects (e.g., PCIe).
+ memory, i.e., memory accessed through off-chip interconnects (e.g., PCIe).
On ROCm platforms using HIP, remote memory refers to memory accessed via
PCIe and is subject to specific atomic operation support. See
`ROCm PCIe Atomics <https://rocm.docs.amd.com/en/latest/conceptual/
pcie-atomics.html>`_ for further details. Prefixing with ``no_remote_memory`` indicates that
atomic operations should not be performed on remote memory.
- ``fine_grained_memory`` means atomic operations may be performed on fine-grained
- memory, i.e. memory regions that support fine-grained coherence, where updates to
+ memory, i.e., memory regions that support fine-grained coherence, where updates to
memory are visible to other parts of the system even while modifications are ongoing.
For example, in HIP, fine-grained coherence ensures that host and device share
up-to-date data without explicit synchronization (see
@@ -6088,7 +6088,7 @@ match rules are specified after the attribute. The compiler expects an
identifier that corresponds to the subject set specifier. The ``apply_to``
specifier is currently the only supported subject set specifier. It allows you
to specify match rules that form a subset of the attribute's allowed subject
-set, i.e. the compiler doesn't require all of the attribute's subjects. For
+set, i.e., the compiler doesn't require all of the attribute's subjects. For
example, an attribute like ``[[nodiscard]]`` whose subject set includes
``enum``, ``record`` and ``hasType(functionType)``, requires the presence of at
least one of these rules after ``apply_to``:
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index c2d1bc1..a97e934 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -811,10 +811,10 @@ bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const {
bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
- .Cases("aarch64", "arm64", "arm", true)
+ .Cases({"aarch64", "arm64", "arm"}, true)
.Case("fmv", HasFMV)
.Case("fp", FPU & FPUMode)
- .Cases("neon", "simd", FPU & NeonMode)
+ .Cases({"neon", "simd"}, FPU & NeonMode)
.Case("jscvt", HasJSCVT)
.Case("fcma", HasFCMA)
.Case("rng", HasRandGen)
@@ -829,8 +829,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
.Case("cssc", HasCSSC)
.Case("sha2", HasSHA2)
.Case("sha3", HasSHA3)
- .Cases("aes", "pmull", HasAES)
- .Cases("fp16", "fullfp16", HasFullFP16)
+ .Cases({"aes", "pmull"}, HasAES)
+ .Cases({"fp16", "fullfp16"}, HasFullFP16)
.Case("dit", HasDIT)
.Case("dpb", HasCCPP)
.Case("dpb2", HasCCDP)
@@ -859,9 +859,9 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
.Case("memtag", HasMTE)
.Case("sb", HasSB)
.Case("predres", HasPredRes)
- .Cases("ssbs", "ssbs2", HasSSBS)
+ .Cases({"ssbs", "ssbs2"}, HasSSBS)
.Case("bti", HasBTI)
- .Cases("ls64", "ls64_v", "ls64_accdata", HasLS64)
+ .Cases({"ls64", "ls64_v", "ls64_accdata"}, HasLS64)
.Case("wfxt", HasWFxT)
.Case("rcpc3", HasRCPC3)
.Case("fp8", HasFP8)
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 9f3a4cd..846b240 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -122,41 +122,41 @@ public:
.Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
.Case("a2", ArchDefineA2)
- .Cases("power3", "pwr3", ArchDefinePpcgr)
- .Cases("power4", "pwr4",
+ .Cases({"power3", "pwr3"}, ArchDefinePpcgr)
+ .Cases({"power4", "pwr4"},
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
.Cases("power5", "pwr5",
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
- .Cases("power5x", "pwr5x",
+ .Cases({"power5x", "pwr5x"},
ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
ArchDefinePpcgr | ArchDefinePpcsq)
- .Cases("power6", "pwr6",
- ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
- ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
- .Cases("power6x", "pwr6x",
+ .Cases({"power6", "pwr6"}, ArchDefinePwr6 | ArchDefinePwr5x |
+ ArchDefinePwr5 | ArchDefinePwr4 |
+ ArchDefinePpcgr | ArchDefinePpcsq)
+ .Cases({"power6x", "pwr6x"},
ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
- .Cases("power7", "pwr7",
- ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
- ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
- ArchDefinePpcsq)
+ .Cases({"power7", "pwr7"}, ArchDefinePwr7 | ArchDefinePwr6 |
+ ArchDefinePwr5x | ArchDefinePwr5 |
+ ArchDefinePwr4 | ArchDefinePpcgr |
+ ArchDefinePpcsq)
// powerpc64le automatically defaults to at least power8.
- .Cases("power8", "pwr8", "ppc64le",
+ .Cases({"power8", "pwr8", "ppc64le"},
ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
ArchDefinePpcgr | ArchDefinePpcsq)
- .Cases("power9", "pwr9",
+ .Cases({"power9", "pwr9"},
ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
- .Cases("power10", "pwr10",
+ .Cases({"power10", "pwr10"},
ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 |
ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
- .Cases("power11", "pwr11",
+ .Cases({"power11", "pwr11"},
ArchDefinePwr11 | ArchDefinePwr10 | ArchDefinePwr9 |
ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d2cb751..bd36eb4 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3214,7 +3214,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
DashX = llvm::StringSwitch<InputKind>(XValue)
.Case("cpp-output", InputKind(Language::C).getPreprocessed())
.Case("assembler-with-cpp", Language::Asm)
- .Cases("ast", "pcm", "precompiled-header",
+ .Cases({"ast", "pcm", "precompiled-header"},
InputKind(Language::Unknown, InputKind::Precompiled))
.Case("ir", Language::LLVM_IR)
.Case("cir", Language::CIR)
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 5b08d7f..1c06152 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1283,10 +1283,10 @@ bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
// Per tweaked N3660, "il", "i", and "if" are also used in the library.
// In C++2a "d" and "y" are used in the library.
return llvm::StringSwitch<bool>(Suffix)
- .Cases("h", "min", "s", true)
- .Cases("ms", "us", "ns", true)
- .Cases("il", "i", "if", true)
- .Cases("d", "y", LangOpts.CPlusPlus20)
+ .Cases({"h", "min", "s"}, true)
+ .Cases({"ms", "us", "ns"}, true)
+ .Cases({"il", "i", "if"}, true)
+ .Cases({"d", "y"}, LangOpts.CPlusPlus20)
.Default(false);
}
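
The clang changes above (and the matching ones in PPC.h, CompilerInvocation.cpp, and later in TargetOptionsImpl.cpp and MasmParser.cpp) migrate llvm::StringSwitch call sites from the variadic Cases overload to the initializer-list form. A minimal standalone sketch of the new form, assuming the LLVM ADT headers are available; the suffix set here is illustrative, not the one from the patch:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Returns true for a small, made-up set of recognized suffixes.
static bool isKnownSuffix(llvm::StringRef Suffix) {
  return llvm::StringSwitch<bool>(Suffix)
      .Cases({"h", "min", "s"}, true)  // brace-enclosed list of literals
      .Cases({"ms", "us", "ns"}, true) // instead of one string per argument
      .Case("d", true)
      .Default(false);
}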
diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst
index da92ed3..855e2cc 100644
--- a/llvm/docs/CIBestPractices.rst
+++ b/llvm/docs/CIBestPractices.rst
@@ -136,3 +136,20 @@ branches as follows:
branches:
- main
- releases/*
+
+Container Best Practices
+========================
+
+This section contains best practices/guidelines when working with containers
+for LLVM infrastructure.
+
+Using Fully Qualified Container Names
+-------------------------------------
+
+When referencing container images from a registry, such as in GitHub Actions
+workflows, or in ``Dockerfile`` files used for building images, prefer fully
+qualified names (i.e., including the registry domain) over just the image.
+For example, prefer ``docker.io/ubuntu:24.04`` over ``ubuntu:24.04``. This
+ensures portability across systems where a different default registry might
+be specified and also prevents attackers from changing the default registry
+to pull in a malicious image instead of the intended one.
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 1fbc41c..be9546e 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -100,8 +100,8 @@ template <typename Bitfield, typename StorageType> struct Impl {
using IntegerType = typename Bitfield::IntegerType;
static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT;
- static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask");
- static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask");
+ static_assert(Bitfield::FirstBit < StorageBits, "Data must fit in mask");
+ static_assert(Bitfield::LastBit < StorageBits, "Data must fit in mask");
static constexpr StorageType LowMask =
maskTrailingOnes<StorageType>(Bitfield::Bits);
static constexpr StorageType Mask = LowMask << Bitfield::Shift;
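
The tightened static_asserts above require a field's first and last bit index to be strictly less than the bit width of the storage type. For context, a small sketch of how such Bitfield elements are typically declared and used; the field layout is made up for illustration:

#include "llvm/ADT/Bitfields.h"
#include <cstdint>

// Hypothetical packed word: a 6-bit opcode in bits [0,6) and a 2-bit mode in
// bits [6,8). With uint8_t storage, LastBit is 5 and 7 respectively, both < 8.
using Opcode = llvm::Bitfield::Element<unsigned, 0, 6>;
using Mode = llvm::Bitfield::Element<unsigned, 6, 2>;

static uint8_t encode() {
  uint8_t Word = 0;
  llvm::Bitfield::set<Opcode>(Word, 42); // fits in 6 bits
  llvm::Bitfield::set<Mode>(Word, 3);    // fits in 2 bits
  return Word;
}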
diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h
index 6a943c5..0cc03cf 100644
--- a/llvm/include/llvm/ADT/ConcurrentHashtable.h
+++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h
@@ -177,7 +177,7 @@ public:
#if LLVM_ENABLE_THREADS
// Lock bucket.
- CurBucket.Guard.lock();
+ std::scoped_lock<std::mutex> Lock(CurBucket.Guard);
#endif
HashesPtr BucketHashes = CurBucket.Hashes;
@@ -195,11 +195,6 @@ public:
CurBucket.NumberOfEntries++;
RehashBucket(CurBucket);
-
-#if LLVM_ENABLE_THREADS
- CurBucket.Guard.unlock();
-#endif
-
return {NewData, true};
}
@@ -208,10 +203,6 @@ public:
KeyDataTy *EntryData = BucketEntries[CurEntryIdx];
if (Info::isEqual(Info::getKey(*EntryData), NewValue)) {
// Already existed entry matched with inserted data is found.
-#if LLVM_ENABLE_THREADS
- CurBucket.Guard.unlock();
-#endif
-
return {EntryData, false};
}
}
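
The hashtable change above replaces matched Guard.lock()/Guard.unlock() calls on every return path with a scope-bound guard. A minimal standalone illustration of that RAII pattern in plain standard C++, unrelated to the hashtable itself:

#include <mutex>

struct Counter {
  std::mutex Guard;
  int Value = 0;

  // The lock is released automatically on every return path (including early
  // returns and exceptions), so no matching unlock() calls are needed.
  bool incrementIfBelow(int Limit) {
    std::scoped_lock<std::mutex> Lock(Guard);
    if (Value >= Limit)
      return false;
    ++Value;
    return true;
  }
};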
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 25b5262..22ef7ed 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -130,13 +130,13 @@ public:
return;
}
- const KeyT EmptyKey = getEmptyKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
if constexpr (std::is_trivially_destructible_v<ValueT>) {
// Use a simpler loop when values don't need destruction.
for (BucketT &B : buckets())
B.getFirst() = EmptyKey;
} else {
- const KeyT TombstoneKey = getTombstoneKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
unsigned NumEntries = getNumEntries();
for (BucketT &B : buckets()) {
if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey)) {
@@ -154,6 +154,17 @@ public:
setNumTombstones(0);
}
+ void shrink_and_clear() {
+ auto [Reallocate, NewNumBuckets] = derived().planShrinkAndClear();
+ destroyAll();
+ if (!Reallocate) {
+ initEmpty();
+ return;
+ }
+ derived().deallocateBuckets();
+ derived().init(NewNumBuckets);
+ }
+
/// Return true if the specified key is in the map, false otherwise.
[[nodiscard]] bool contains(const_arg_type_t<KeyT> Val) const {
return doFind(Val) != nullptr;
@@ -314,7 +325,7 @@ public:
return false; // not in map.
TheBucket->getSecond().~ValueT();
- TheBucket->getFirst() = getTombstoneKey();
+ TheBucket->getFirst() = KeyInfoT::getTombstoneKey();
decrementNumEntries();
incrementNumTombstones();
return true;
@@ -322,7 +333,7 @@ public:
void erase(iterator I) {
BucketT *TheBucket = &*I;
TheBucket->getSecond().~ValueT();
- TheBucket->getFirst() = getTombstoneKey();
+ TheBucket->getFirst() = KeyInfoT::getTombstoneKey();
decrementNumEntries();
incrementNumTombstones();
}
@@ -353,10 +364,17 @@ protected:
DenseMapBase() = default;
void destroyAll() {
+ // No need to iterate through the buckets if both KeyT and ValueT are
+ // trivially destructible.
+ if constexpr (std::is_trivially_destructible_v<KeyT> &&
+ std::is_trivially_destructible_v<ValueT>)
+ return;
+
if (getNumBuckets() == 0) // Nothing to do.
return;
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
for (BucketT &B : buckets()) {
if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
!KeyInfoT::isEqual(B.getFirst(), TombstoneKey))
@@ -366,12 +384,14 @@ protected:
}
void initEmpty() {
+ static_assert(std::is_base_of_v<DenseMapBase, DerivedT>,
+ "Must pass the derived type to this template!");
setNumEntries(0);
setNumTombstones(0);
assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 &&
"# initial buckets must be a power of two!");
- const KeyT EmptyKey = getEmptyKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
for (BucketT &B : buckets())
::new (&B.getFirst()) KeyT(EmptyKey);
}
@@ -391,8 +411,8 @@ protected:
initEmpty();
// Insert all the old elements.
- const KeyT EmptyKey = getEmptyKey();
- const KeyT TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
for (BucketT &B : OldBuckets) {
if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
!KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
@@ -412,9 +432,16 @@ protected:
}
}
- template <typename OtherBaseT>
- void copyFrom(
- const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) {
+ void copyFrom(const DerivedT &other) {
+ this->destroyAll();
+ derived().deallocateBuckets();
+ setNumEntries(0);
+ setNumTombstones(0);
+ if (!derived().allocateBuckets(other.getNumBuckets())) {
+ // The bucket list is empty. No work to do.
+ return;
+ }
+
assert(&other != this);
assert(getNumBuckets() == other.getNumBuckets());
@@ -429,8 +456,8 @@ protected:
memcpy(reinterpret_cast<void *>(Buckets), OtherBuckets,
NumBuckets * sizeof(BucketT));
} else {
- const KeyT EmptyKey = getEmptyKey();
- const KeyT TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
for (size_t I = 0; I < NumBuckets; ++I) {
::new (&Buckets[I].getFirst()) KeyT(OtherBuckets[I].getFirst());
if (!KeyInfoT::isEqual(Buckets[I].getFirst(), EmptyKey) &&
@@ -440,19 +467,6 @@ protected:
}
}
- template <typename LookupKeyT>
- static unsigned getHashValue(const LookupKeyT &Val) {
- return KeyInfoT::getHashValue(Val);
- }
-
- static const KeyT getEmptyKey() {
- static_assert(std::is_base_of_v<DenseMapBase, DerivedT>,
- "Must pass the derived type to this template!");
- return KeyInfoT::getEmptyKey();
- }
-
- static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); }
-
private:
DerivedT &derived() { return *static_cast<DerivedT *>(this); }
const DerivedT &derived() const {
@@ -526,8 +540,6 @@ private:
void grow(unsigned AtLeast) { derived().grow(AtLeast); }
- void shrink_and_clear() { derived().shrink_and_clear(); }
-
template <typename LookupKeyT>
BucketT *findBucketForInsertion(const LookupKeyT &Lookup,
BucketT *TheBucket) {
@@ -560,7 +572,7 @@ private:
incrementNumEntries();
// If we are writing over a tombstone, remember this.
- const KeyT EmptyKey = getEmptyKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey))
decrementNumTombstones();
@@ -574,8 +586,8 @@ private:
if (NumBuckets == 0)
return nullptr;
- const KeyT EmptyKey = getEmptyKey();
- unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1);
unsigned ProbeAmt = 1;
while (true) {
const BucketT *Bucket = BucketsPtr + BucketNo;
@@ -612,13 +624,13 @@ private:
// FoundTombstone - Keep track of whether we find a tombstone while probing.
BucketT *FoundTombstone = nullptr;
- const KeyT EmptyKey = getEmptyKey();
- const KeyT TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
assert(!KeyInfoT::isEqual(Val, EmptyKey) &&
!KeyInfoT::isEqual(Val, TombstoneKey) &&
"Empty/Tombstone value shouldn't be inserted into map!");
- unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
+ unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1);
unsigned ProbeAmt = 1;
while (true) {
BucketT *ThisBucket = BucketsPtr + BucketNo;
@@ -719,7 +731,7 @@ public:
DenseMap(const DenseMap &other) : BaseT() {
init(0);
- copyFrom(other);
+ this->copyFrom(other);
}
DenseMap(DenseMap &&other) : BaseT() {
@@ -755,7 +767,7 @@ public:
DenseMap &operator=(const DenseMap &other) {
if (&other != this)
- copyFrom(other);
+ this->copyFrom(other);
return *this;
}
@@ -767,56 +779,6 @@ public:
return *this;
}
- void copyFrom(const DenseMap &other) {
- this->destroyAll();
- deallocateBuckets();
- if (allocateBuckets(other.NumBuckets)) {
- this->BaseT::copyFrom(other);
- } else {
- NumEntries = 0;
- NumTombstones = 0;
- }
- }
-
- void grow(unsigned AtLeast) {
- unsigned OldNumBuckets = NumBuckets;
- BucketT *OldBuckets = Buckets;
-
- allocateBuckets(std::max<unsigned>(
- 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
- assert(Buckets);
- if (!OldBuckets) {
- this->BaseT::initEmpty();
- return;
- }
-
- this->moveFromOldBuckets(
- llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets));
-
- // Free the old table.
- deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
- alignof(BucketT));
- }
-
- void shrink_and_clear() {
- unsigned OldNumBuckets = NumBuckets;
- unsigned OldNumEntries = NumEntries;
- this->destroyAll();
-
- // Reduce the number of buckets.
- unsigned NewNumBuckets = 0;
- if (OldNumEntries)
- NewNumBuckets = std::max(64, 1 << (Log2_32_Ceil(OldNumEntries) + 1));
- if (NewNumBuckets == NumBuckets) {
- this->BaseT::initEmpty();
- return;
- }
-
- deallocate_buffer(Buckets, sizeof(BucketT) * OldNumBuckets,
- alignof(BucketT));
- init(NewNumBuckets);
- }
-
private:
unsigned getNumEntries() const { return NumEntries; }
@@ -855,6 +817,38 @@ private:
NumTombstones = 0;
}
}
+
+ void grow(unsigned AtLeast) {
+ unsigned OldNumBuckets = NumBuckets;
+ BucketT *OldBuckets = Buckets;
+
+ allocateBuckets(std::max<unsigned>(
+ 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
+ assert(Buckets);
+ if (!OldBuckets) {
+ this->BaseT::initEmpty();
+ return;
+ }
+
+ this->moveFromOldBuckets(
+ llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets));
+
+ // Free the old table.
+ deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
+ alignof(BucketT));
+ }
+
+ // Plan how to shrink the bucket table. Return:
+ // - {false, 0} to reuse the existing bucket table
+ // - {true, N} to reallocate a bucket table with N entries
+ std::pair<bool, unsigned> planShrinkAndClear() const {
+ unsigned NewNumBuckets = 0;
+ if (NumEntries)
+ NewNumBuckets = std::max(64u, 1u << (Log2_32_Ceil(NumEntries) + 1));
+ if (NewNumBuckets == NumBuckets)
+ return {false, 0}; // Reuse.
+ return {true, NewNumBuckets}; // Reallocate.
+ }
};
template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4,
@@ -896,7 +890,7 @@ public:
SmallDenseMap(const SmallDenseMap &other) : BaseT() {
init(0);
- copyFrom(other);
+ this->copyFrom(other);
}
SmallDenseMap(SmallDenseMap &&other) : BaseT() {
@@ -928,8 +922,8 @@ public:
NumEntries = TmpNumEntries;
std::swap(NumTombstones, RHS.NumTombstones);
- const KeyT EmptyKey = this->getEmptyKey();
- const KeyT TombstoneKey = this->getTombstoneKey();
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
if (Small && RHS.Small) {
// If we're swapping inline bucket arrays, we have to cope with some of
// the tricky bits of DenseMap's storage system: the buckets are not
@@ -995,7 +989,7 @@ public:
SmallDenseMap &operator=(const SmallDenseMap &other) {
if (&other != this)
- copyFrom(other);
+ this->copyFrom(other);
return *this;
}
@@ -1007,86 +1001,6 @@ public:
return *this;
}
- void copyFrom(const SmallDenseMap &other) {
- this->destroyAll();
- deallocateBuckets();
- allocateBuckets(other.getNumBuckets());
- this->BaseT::copyFrom(other);
- }
-
- void init(unsigned InitNumEntries) {
- auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
- allocateBuckets(InitBuckets);
- this->BaseT::initEmpty();
- }
-
- void grow(unsigned AtLeast) {
- if (AtLeast > InlineBuckets)
- AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
-
- if (Small) {
- // First move the inline buckets into a temporary storage.
- AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage;
- BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage);
- BucketT *TmpEnd = TmpBegin;
-
- // Loop over the buckets, moving non-empty, non-tombstones into the
- // temporary storage. Have the loop move the TmpEnd forward as it goes.
- const KeyT EmptyKey = this->getEmptyKey();
- const KeyT TombstoneKey = this->getTombstoneKey();
- for (BucketT &B : inlineBuckets()) {
- if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
- !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
- assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
- "Too many inline buckets!");
- ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst()));
- ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond()));
- ++TmpEnd;
- B.getSecond().~ValueT();
- }
- B.getFirst().~KeyT();
- }
-
- // AtLeast == InlineBuckets can happen if there are many tombstones,
- // and grow() is used to remove them. Usually we always switch to the
- // large rep here.
- allocateBuckets(AtLeast);
- this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd));
- return;
- }
-
- LargeRep OldRep = std::move(*getLargeRep());
- getLargeRep()->~LargeRep();
- allocateBuckets(AtLeast);
-
- this->moveFromOldBuckets(OldRep.buckets());
-
- // Free the old table.
- deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
- alignof(BucketT));
- }
-
- void shrink_and_clear() {
- unsigned OldSize = this->size();
- this->destroyAll();
-
- // Reduce the number of buckets.
- unsigned NewNumBuckets = 0;
- if (OldSize) {
- NewNumBuckets = 1 << (Log2_32_Ceil(OldSize) + 1);
- if (NewNumBuckets > InlineBuckets && NewNumBuckets < 64u)
- NewNumBuckets = 64;
- }
- if ((Small && NewNumBuckets <= InlineBuckets) ||
- (!Small && NewNumBuckets == getLargeRep()->NumBuckets)) {
- this->BaseT::initEmpty();
- return;
- }
-
- deallocateBuckets();
- init(NewNumBuckets);
- }
-
private:
unsigned getNumEntries() const { return NumEntries; }
@@ -1152,7 +1066,7 @@ private:
getLargeRep()->~LargeRep();
}
- void allocateBuckets(unsigned Num) {
+ bool allocateBuckets(unsigned Num) {
if (Num <= InlineBuckets) {
Small = true;
} else {
@@ -1161,6 +1075,76 @@ private:
allocate_buffer(sizeof(BucketT) * Num, alignof(BucketT)));
new (getLargeRep()) LargeRep{NewBuckets, Num};
}
+ return true;
+ }
+
+ void init(unsigned InitNumEntries) {
+ auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
+ allocateBuckets(InitBuckets);
+ this->BaseT::initEmpty();
+ }
+
+ void grow(unsigned AtLeast) {
+ if (AtLeast > InlineBuckets)
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
+
+ if (Small) {
+ // First move the inline buckets into a temporary storage.
+ AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage;
+ BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage);
+ BucketT *TmpEnd = TmpBegin;
+
+ // Loop over the buckets, moving non-empty, non-tombstones into the
+ // temporary storage. Have the loop move the TmpEnd forward as it goes.
+ const KeyT EmptyKey = KeyInfoT::getEmptyKey();
+ const KeyT TombstoneKey = KeyInfoT::getTombstoneKey();
+ for (BucketT &B : inlineBuckets()) {
+ if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
+ assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
+ "Too many inline buckets!");
+ ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst()));
+ ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond()));
+ ++TmpEnd;
+ B.getSecond().~ValueT();
+ }
+ B.getFirst().~KeyT();
+ }
+
+ // AtLeast == InlineBuckets can happen if there are many tombstones,
+ // and grow() is used to remove them. Usually we always switch to the
+ // large rep here.
+ allocateBuckets(AtLeast);
+ this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd));
+ return;
+ }
+
+ LargeRep OldRep = std::move(*getLargeRep());
+ getLargeRep()->~LargeRep();
+ allocateBuckets(AtLeast);
+
+ this->moveFromOldBuckets(OldRep.buckets());
+
+ // Free the old table.
+ deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
+ alignof(BucketT));
+ }
+
+ // Plan how to shrink the bucket table. Return:
+ // - {false, 0} to reuse the existing bucket table
+ // - {true, N} to reallocate a bucket table with N entries
+ std::pair<bool, unsigned> planShrinkAndClear() const {
+ unsigned NewNumBuckets = 0;
+ if (!this->empty()) {
+ NewNumBuckets = 1u << (Log2_32_Ceil(this->size()) + 1);
+ if (NewNumBuckets > InlineBuckets)
+ NewNumBuckets = std::max(64u, NewNumBuckets);
+ }
+ bool Reuse = Small ? NewNumBuckets <= InlineBuckets
+ : NewNumBuckets == getLargeRep()->NumBuckets;
+ if (Reuse)
+ return {false, 0}; // Reuse.
+ return {true, NewNumBuckets}; // Reallocate.
}
};
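
With this refactoring, shrink_and_clear() lives in DenseMapBase and asks the derived map, via planShrinkAndClear(), whether to keep or reallocate the bucket table; the externally visible behaviour is unchanged. A small usage sketch with arbitrary key/value types:

#include "llvm/ADT/DenseMap.h"

static void demo() {
  llvm::DenseMap<int, int> M;
  for (int I = 0; I < 1000; ++I)
    M[I] = 2 * I;

  // clear() keeps the (now large) bucket array for reuse; shrink_and_clear()
  // also releases it by switching to a small table when that is profitable.
  M.shrink_and_clear();
  // M is empty and its bucket table has been shrunk.
}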
diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h
index b51c586..6502a70 100644
--- a/llvm/include/llvm/Support/Recycler.h
+++ b/llvm/include/llvm/Support/Recycler.h
@@ -19,6 +19,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
+#include <type_traits>
namespace llvm {
@@ -72,19 +73,19 @@ public:
/// deleted; calling clear is one way to ensure this.
template<class AllocatorType>
void clear(AllocatorType &Allocator) {
- while (FreeList) {
- T *t = reinterpret_cast<T *>(pop_val());
- Allocator.Deallocate(t, Size, Align);
+ if constexpr (std::is_same_v<std::decay_t<AllocatorType>,
+ BumpPtrAllocator>) {
+ // For BumpPtrAllocator, Deallocate is a no-op, so just drop the free
+ // list.
+ FreeList = nullptr;
+ } else {
+ while (FreeList) {
+ T *t = reinterpret_cast<T *>(pop_val());
+ Allocator.Deallocate(t, Size, Align);
+ }
}
}
- /// Special case for BumpPtrAllocator which has an empty Deallocate()
- /// function.
- ///
- /// There is no need to traverse the free list, pulling all the objects into
- /// cache.
- void clear(BumpPtrAllocator &) { FreeList = nullptr; }
-
template<class SubClass, class AllocatorType>
SubClass *Allocate(AllocatorType &Allocator) {
static_assert(alignof(SubClass) <= Align,
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index a235975..860f73c 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -167,8 +167,9 @@ private:
std::vector<GlobMatcher::Glob> Globs;
RadixTree<iterator_range<StringRef::const_iterator>,
- SmallVector<const GlobMatcher::Glob *, 1>>
- PrefixToGlob;
+ RadixTree<iterator_range<StringRef::const_reverse_iterator>,
+ SmallVector<const GlobMatcher::Glob *, 1>>>
+ PrefixSuffixToGlob;
};
/// Represents a set of patterns and their line numbers
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 178529f..52c43a4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -9084,6 +9084,8 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
SmallVector<Register, 32> BuildVec;
LLT EltTy = DstTy.getScalarType();
+ DenseMap<unsigned, Register> CachedExtract;
+
for (int Idx : Mask) {
if (Idx < 0) {
if (!Undef.isValid())
@@ -9097,9 +9099,13 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
int NumElts = Src0Ty.getNumElements();
Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
- auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
- auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
- BuildVec.push_back(Extract.getReg(0));
+ auto [It, Inserted] = CachedExtract.try_emplace(Idx);
+ if (Inserted) {
+ auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
+ It->second =
+ MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK).getReg(0);
+ }
+ BuildVec.push_back(It->second);
}
assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR");
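
The shuffle lowering now memoizes the extract created for each source index, so a mask that repeats an index emits a single G_EXTRACT_VECTOR_ELT instead of one per occurrence. The same try_emplace caching idiom in a self-contained form; the cached computation is illustrative:

#include "llvm/ADT/DenseMap.h"

static int expensiveCompute(int Idx) { return Idx * Idx; }

static int getCached(llvm::DenseMap<int, int> &Cache, int Idx) {
  // try_emplace default-constructs the value only when the key is new;
  // 'Inserted' tells us whether the slot still needs to be filled in.
  auto [It, Inserted] = Cache.try_emplace(Idx);
  if (Inserted)
    It->second = expensiveCompute(Idx);
  return It->second;
}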
diff --git a/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/llvm/lib/CodeGen/RegisterUsageInfo.cpp
index 7a4628a..2ef380f 100644
--- a/llvm/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -44,7 +44,7 @@ void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) {
}
bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
- RegMasks.grow(M.size());
+ RegMasks.reserve(M.size());
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dee0909..a522650 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2015,9 +2015,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
Register InReg = FuncInfo.InitializeRegForValue(Inst);
std::optional<CallingConv::ID> CallConv;
- auto *CI = dyn_cast<CallInst>(Inst);
- if (CI && !CI->isInlineAsm())
- CallConv = CI->getCallingConv();
+ auto *CB = dyn_cast<CallBase>(Inst);
+ if (CB && !CB->isInlineAsm())
+ CallConv = CB->getCallingConv();
RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
Inst->getType(), CallConv);
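
Switching the dyn_cast from CallInst to CallBase means the calling convention is also picked up for invoke (and callbr) instructions, not only plain calls. A sketch of the pattern, assuming the usual LLVM IR headers:

#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include <optional>

// CallBase is the common base class of CallInst, InvokeInst and CallBrInst,
// so a single dyn_cast covers every call-like instruction.
static std::optional<llvm::CallingConv::ID>
getCallConv(const llvm::Instruction &I) {
  if (const auto *CB = llvm::dyn_cast<llvm::CallBase>(&I))
    if (!CB->isInlineAsm())
      return CB->getCallingConv();
  return std::nullopt;
}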
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 920dff9..da4e409 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9899,6 +9899,18 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
// Use a rotate by 8. This can be further expanded if necessary.
return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
case MVT::i32:
+    // This is meant for ARM specifically, which has ROTR but no ROTL.
+ if (isOperationLegalOrCustom(ISD::ROTR, VT)) {
+ SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT);
+ // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF
+ SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask);
+ SDValue Rotr =
+ DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT));
+ SDValue Rotl =
+ DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask);
+ return DAG.getNode(ISD::OR, dl, VT, Rotr, And2);
+ }
Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,
DAG.getConstant(0xFF00, dl, VT));
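
The new i32 path relies on the identity bswap(x) = rotr(x & 0x00FF00FF, 8) | (rotr(x, 24) & 0x00FF00FF), which needs only ROTR, a single mask constant, two ANDs and an OR. A standalone check of that identity in plain C++20, independent of the SelectionDAG code:

#include <bit>
#include <cassert>
#include <cstdint>

static uint32_t bswapViaRotr(uint32_t X) {
  const uint32_t Mask = 0x00FF00FFu;
  uint32_t Lo = std::rotr(X & Mask, 8);  // bytes 0 and 2 move to positions 3 and 1
  uint32_t Hi = std::rotr(X, 24) & Mask; // rotr by 24 == rotl by 8; keeps bytes 3 and 1
  return Lo | Hi;
}

static uint32_t bswapReference(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0x0000FF00u) | ((X << 8) & 0x00FF0000u) |
         (X << 24);
}

int main() {
  for (uint32_t X : {0x12345678u, 0xDEADBEEFu, 0u, 0xFFFFFFFFu})
    assert(bswapViaRotr(X) == bswapReference(X));
}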
diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index 049efe8..c33bf8b 100644
--- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -44,7 +44,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const {
return false;
return StringSwitch<bool>(FPAttr.getValueAsString())
- .Cases("all", "non-leaf", "reserved", true)
+ .Cases({"all", "non-leaf", "reserved"}, true)
.Case("none", false);
}
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index d4901d9..8a8f111 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -5844,11 +5844,11 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member,
bool MasmParser::lookUpType(StringRef Name, AsmTypeInfo &Info) const {
unsigned Size = StringSwitch<unsigned>(Name)
- .CasesLower("byte", "db", "sbyte", 1)
- .CasesLower("word", "dw", "sword", 2)
- .CasesLower("dword", "dd", "sdword", 4)
- .CasesLower("fword", "df", 6)
- .CasesLower("qword", "dq", "sqword", 8)
+ .CasesLower({"byte", "db", "sbyte"}, 1)
+ .CasesLower({"word", "dw", "sword"}, 2)
+ .CasesLower({"dword", "dd", "sdword"}, 4)
+ .CasesLower({"fword", "df"}, 6)
+ .CasesLower({"qword", "dq", "sqword"}, 8)
.CaseLower("real4", 4)
.CaseLower("real8", 8)
.CaseLower("real10", 10)
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index c27f627..3a97185 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -92,8 +92,10 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
for (const auto &G : reverse(Globs)) {
StringRef Prefix = G.Pattern.prefix();
+ StringRef Suffix = G.Pattern.suffix();
- auto &V = PrefixToGlob.emplace(Prefix).first->second;
+ auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second;
+ auto &V = SToGlob.emplace(reverse(Suffix)).first->second;
V.emplace_back(&G);
}
}
@@ -101,16 +103,18 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
void SpecialCaseList::GlobMatcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
- if (!PrefixToGlob.empty()) {
- for (const auto &[_, V] : PrefixToGlob.find_prefixes(Query)) {
- for (const auto *G : V) {
- if (G->Pattern.match(Query)) {
- Cb(G->Name, G->LineNo);
- // As soon as we find a match in the vector, we can break for this
- // vector, since the globs are already sorted by priority within the
- // prefix group. However, we continue searching other prefix groups in
- // the map, as they may contain a better match overall.
- break;
+ if (!PrefixSuffixToGlob.empty()) {
+ for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) {
+ for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) {
+ for (const auto *G : V) {
+ if (G->Pattern.match(Query)) {
+ Cb(G->Name, G->LineNo);
+ // As soon as we find a match in the vector, we can break for this
+ // vector, since the globs are already sorted by priority within the
+ // prefix group. However, we continue searching other prefix groups
+ // in the map, as they may contain a better match overall.
+ break;
+ }
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d1832f4..f680a5e 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -651,7 +651,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
void mappingImpl(yaml::IO &YamlIO) override;
- ~AArch64FunctionInfo() = default;
+ ~AArch64FunctionInfo() override = default;
};
template <> struct MappingTraits<AArch64FunctionInfo> {
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 72a7676..47d76f3 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -154,7 +154,7 @@ public:
bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override;
- virtual bool isIgnoredCVReg(MCRegister LLVMReg) const override;
+ bool isIgnoredCVReg(MCRegister LLVMReg) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1b4b113..6bad4dbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -131,7 +131,7 @@ protected:
public:
MetadataStreamerMsgPackV4() = default;
- ~MetadataStreamerMsgPackV4() = default;
+ ~MetadataStreamerMsgPackV4() override = default;
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
@@ -154,7 +154,7 @@ protected:
public:
MetadataStreamerMsgPackV5() = default;
- ~MetadataStreamerMsgPackV5() = default;
+ ~MetadataStreamerMsgPackV5() override = default;
};
class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
@@ -163,7 +163,7 @@ protected:
public:
MetadataStreamerMsgPackV6() = default;
- ~MetadataStreamerMsgPackV6() = default;
+ ~MetadataStreamerMsgPackV6() override = default;
void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF,
msgpack::MapDocNode Kern) override;
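
This and most of the remaining target-specific changes simply add override (or drop a redundant virtual) on destructors and virtual methods of derived classes. A tiny illustration of why that is preferred, using hypothetical class names:

struct Streamer {
  virtual ~Streamer() = default;
  virtual void emit() = 0;
};

struct MsgPackStreamer final : Streamer {
  // 'override' makes the compiler verify that this really overrides a virtual
  // member of a base class; if the base declaration changes, this line fails
  // to compile instead of silently declaring something unrelated.
  ~MsgPackStreamer() override = default;
  void emit() override {}
};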
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index c5c9473..0804133 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -26,20 +26,19 @@ struct PerFunctionMIParsingState;
class AMDGPUMIRFormatter final : public MIRFormatter {
public:
AMDGPUMIRFormatter() = default;
- virtual ~AMDGPUMIRFormatter() = default;
+ ~AMDGPUMIRFormatter() override = default;
/// Implement target specific printing for machine operand immediate value, so
/// that we can have more meaningful mnemonic than a 64-bit integer. Passing
/// None to OpIdx means the index is unknown.
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
- std::optional<unsigned> OpIdx,
- int64_t Imm) const override;
+ void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx, int64_t Imm) const override;
/// Implement target specific parsing of immediate mnemonics. The mnemonic is
/// a string with a leading dot.
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
- StringRef Src, int64_t &Imm,
- ErrorCallbackType ErrorCallback) const override;
+ bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
/// Implement target specific parsing of target custom pseudo source value.
bool
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cbc7427..4d0c163 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -32,7 +32,7 @@ public:
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~AMDGPUInstrPostProcess() = default;
+ ~AMDGPUInstrPostProcess() override = default;
void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
};
@@ -88,7 +88,7 @@ public:
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
- ~AMDGPUCustomBehaviour() = default;
+ ~AMDGPUCustomBehaviour() override = default;
/// This method is used to determine if an instruction
/// should be allowed to be dispatched. The return value is
/// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 54fcd2a..246a3f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -64,7 +64,7 @@ private:
ArrayRef<const MCExpr *> Args;
AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
- ~AMDGPUMCExpr();
+ ~AMDGPUMCExpr() override;
bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const;
bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5fdedda..dc23a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1640,7 +1640,7 @@ public:
unsigned *PredCost = nullptr) const override;
InstructionUniformity
- getInstructionUniformity(const MachineInstr &MI) const override final;
+ getInstructionUniformity(const MachineInstr &MI) const final;
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2c1a13c..019c3b7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
const llvm::MachineFunction &MF);
void mappingImpl(yaml::IO &YamlIO) override;
- ~SIMachineFunctionInfo() = default;
+ ~SIMachineFunctionInfo() override = default;
};
template <> struct MappingTraits<SIMachineFunctionInfo> {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 07264d9..a177a42 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -640,7 +640,7 @@ public:
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
- virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+ bool handleCooperativeAtomic(MachineInstr &MI) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index a9c4b53..72eb3d0 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -312,7 +312,7 @@ struct ARMFunctionInfo final : public yaml::MachineFunctionInfo {
ARMFunctionInfo(const llvm::ARMFunctionInfo &MFI);
void mappingImpl(yaml::IO &YamlIO) override;
- ~ARMFunctionInfo() = default;
+ ~ARMFunctionInfo() override = default;
};
template <> struct MappingTraits<ARMFunctionInfo> {
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 3a840a3..5548ad1 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -40,7 +40,7 @@ class AVRDisassembler : public MCDisassembler {
public:
AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- virtual ~AVRDisassembler() = default;
+ ~AVRDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 619efb3..03c60e8 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -24,7 +24,7 @@ class AVRELFObjectWriter : public MCELFObjectTargetWriter {
public:
AVRELFObjectWriter(uint8_t OSABI);
- virtual ~AVRELFObjectWriter() = default;
+ ~AVRELFObjectWriter() override = default;
unsigned getRelocType(const MCFixup &, const MCValue &,
bool IsPCRel) const override;
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h
index 0cfb283..90ef207 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.h
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h
@@ -32,7 +32,7 @@ public:
void emitInstruction(const MachineInstr *MI) override;
MCSymbol *getJTPublicSymbol(unsigned JTI);
- virtual void emitJumpTableInfo() override;
+ void emitJumpTableInfo() override;
static char ID;
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index e3c39a1..b12985d 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -46,7 +46,7 @@ class BPFCheckAndAdjustIR final : public ModulePass {
public:
static char ID;
BPFCheckAndAdjustIR() : ModulePass(ID) {}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
void checkIR(Module &M);
diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
index f3064c0c..af3542e 100644
--- a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
+++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
@@ -16,7 +16,7 @@ namespace llvm {
class BPFTargetLoweringObjectFileELF : public TargetLoweringObjectFileELF {
public:
- virtual MCSection *
+ MCSection *
getSectionForJumpTable(const Function &F, const TargetMachine &TM,
const MachineJumpTableEntry *JTE) const override;
};
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 4aecaf1..8e791e6 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -635,7 +635,7 @@ public:
B.setChangeObserver(*this);
}
- ~InstManager() { B.stopObservingChanges(); }
+ ~InstManager() override { B.stopObservingChanges(); }
void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); }
void erasingInstr(MachineInstr &MI) override {}
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
index 34efa0b..4ccd3cf 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
@@ -33,7 +33,7 @@ public:
explicit RISCVLMULInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {}
- ~RISCVLMULInstrument() = default;
+ ~RISCVLMULInstrument() override = default;
uint8_t getLMUL() const;
};
@@ -45,7 +45,7 @@ public:
explicit RISCVSEWInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {}
- ~RISCVSEWInstrument() = default;
+ ~RISCVSEWInstrument() override = default;
uint8_t getSEW() const;
};
diff --git a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
index b69904d..b2ce57a 100644
--- a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
+++ b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
@@ -37,7 +37,7 @@ private:
RISCVCPKind Kind;
public:
- ~RISCVConstantPoolValue() = default;
+ ~RISCVConstantPoolValue() override = default;
static RISCVConstantPoolValue *Create(const GlobalValue *GV);
static RISCVConstantPoolValue *Create(LLVMContext &C, StringRef S);
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 4fa93f1..f9be80f 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -31,7 +31,7 @@ struct RISCVMachineFunctionInfo final : public yaml::MachineFunctionInfo {
RISCVMachineFunctionInfo(const llvm::RISCVMachineFunctionInfo &MFI);
void mappingImpl(yaml::IO &YamlIO) override;
- ~RISCVMachineFunctionInfo() = default;
+ ~RISCVMachineFunctionInfo() override = default;
};
template <> struct MappingTraits<RISCVMachineFunctionInfo> {
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 28a1690..6e444c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -347,7 +347,7 @@ class SPIRVLegalizePointerCast : public FunctionPass {
public:
SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) {}
- virtual bool runOnFunction(Function &F) override {
+ bool runOnFunction(Function &F) override {
const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(F);
GR = ST.getSPIRVGlobalRegistry();
DeadInstructions.clear();
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 60d39c9..aba9cf7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -234,7 +234,7 @@ public:
}
#endif
- virtual bool runOnFunction(Function &F) override {
+ bool runOnFunction(Function &F) override {
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *TopLevelRegion =
getAnalysis<SPIRVConvergenceRegionAnalysisWrapperPass>()
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index e621bcd44..b1a8d1a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -28,7 +28,7 @@ public:
SPIRVStripConvergentIntrinsics() : FunctionPass(ID) {}
- virtual bool runOnFunction(Function &F) override {
+ bool runOnFunction(Function &F) override {
DenseSet<Instruction *> ToRemove;
// If the instruction is a convergent intrinsic, add it to the kill-list and
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 5b149f8..ea634fb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -1113,7 +1113,7 @@ public:
SPIRVStructurizer() : FunctionPass(ID) {}
- virtual bool runOnFunction(Function &F) override {
+ bool runOnFunction(Function &F) override {
bool Modified = false;
// In LLVM, Switches are allowed to have several cases branching to the same
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index e585e5a..b4dadaa 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -36,7 +36,7 @@ class SparcDisassembler : public MCDisassembler {
public:
SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- virtual ~SparcDisassembler() = default;
+ ~SparcDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index 1ff6cc8..ba325b5 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -111,7 +111,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
public:
SystemZPostRASchedStrategy(const MachineSchedContext *C);
- virtual ~SystemZPostRASchedStrategy();
+ ~SystemZPostRASchedStrategy() override;
/// Called for a region before scheduling.
void initPolicy(MachineBasicBlock::iterator Begin,
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index b1de145..bea8b9f 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -101,7 +101,7 @@ public:
int getStackPointerBias() final { return 2048; }
/// Destroys the object. Bogus destructor overriding base class destructor
- ~SystemZXPLINK64Registers() = default;
+ ~SystemZXPLINK64Registers() override = default;
};
/// ELF calling convention specific use registers
@@ -124,7 +124,7 @@ public:
int getStackPointerBias() final { return 0; }
/// Destroys the object. Bogus destructor overriding base class destructor
- ~SystemZELFRegisters() = default;
+ ~SystemZELFRegisters() override = default;
};
struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index aad826b..465e074 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -36,7 +36,7 @@ class VEDisassembler : public MCDisassembler {
public:
VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- virtual ~VEDisassembler() = default;
+ ~VEDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 45bbf12..9175b27 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -102,7 +102,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
WebAssemblyOperand(SMLoc Start, SMLoc End, CaLOp C)
: Kind(CatchList), StartLoc(Start), EndLoc(End), CaL(C) {}
- ~WebAssemblyOperand() {
+ ~WebAssemblyOperand() override {
if (isBrList())
BrL.~BrLOp();
if (isCatchList())
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 40ae4ae..ff4d6469 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -192,7 +192,7 @@ struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo {
const llvm::WebAssemblyFunctionInfo &MFI);
void mappingImpl(yaml::IO &YamlIO) override;
- ~WebAssemblyFunctionInfo() = default;
+ ~WebAssemblyFunctionInfo() override = default;
};
template <> struct MappingTraits<WebAssemblyFunctionInfo> {
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index ac251fd..127ee67 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3533,10 +3533,10 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
while (isLockRepeatNtPrefix(Name.lower())) {
unsigned Prefix =
StringSwitch<unsigned>(Name)
- .Cases("lock", "lock", X86::IP_HAS_LOCK)
- .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
- .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
- .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK)
+ .Case("lock", X86::IP_HAS_LOCK)
+ .Cases({"rep", "repe", "repz"}, X86::IP_HAS_REPEAT)
+ .Cases({"repne", "repnz"}, X86::IP_HAS_REPEAT_NE)
+ .Case("notrack", X86::IP_HAS_NOTRACK)
.Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
Flags |= Prefix;
if (getLexer().is(AsmToken::EndOfStatement)) {
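This hunk (and the similar ones in llvm-cov and the StringSwitch unit tests below) moves callers to the initializer-list overload of `StringSwitch::Cases`, which groups the alternative strings explicitly instead of repeating a string to satisfy the old variadic form. A minimal sketch of the new usage, with a hypothetical helper name and placeholder integer codes standing in for the real prefix flags:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Hypothetical helper, not part of the patch.
unsigned classifyPrefix(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("lock", 1)
      .Cases({"rep", "repe", "repz"}, 2) // one value for several spellings
      .Cases({"repne", "repnz"}, 3)
      .Default(0);
}
```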
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index d6197f3..05a1c22 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -37,7 +37,7 @@ public:
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~X86InstrPostProcess() = default;
+ ~X86InstrPostProcess() override = default;
void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
};
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 1c06dc4..af5a698 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -491,7 +491,7 @@ namespace X86_MC {
class X86MCInstrAnalysis : public MCInstrAnalysis {
X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete;
X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete;
- virtual ~X86MCInstrAnalysis() = default;
+ ~X86MCInstrAnalysis() override = default;
public:
X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 5f974e5..1bda505 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -43,7 +43,7 @@ struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo {
X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI);
void mappingImpl(yaml::IO &YamlIO) override;
- ~X86MachineFunctionInfo() = default;
+ ~X86MachineFunctionInfo() override = default;
};
template <> struct MappingTraits<X86MachineFunctionInfo> {
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 077d29f..3b59ebb 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -272,6 +272,9 @@ AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA,
}
if (RangePtr && !RangePtr->offsetOrSizeAreUnknown()) {
+ int64_t StorageSize = DL.getTypeStoreSize(&Ty);
+ if (StorageSize != RangePtr->Size)
+ return nullptr;
APInt Offset = APInt(64, RangePtr->Offset);
return ConstantFoldLoadFromConst(Initializer, &Ty, Offset, DL);
}
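The added guard refuses to constant-fold a load whose type covers more (or fewer) bytes than the access range the Attributor has actually reasoned about, which is what the new range-and-constant-fold.ll test further down exercises. A simplified stand-in for the check, assuming the same DataLayout/Type API used in the hunk:

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Simplified sketch, not the actual Attributor code: folding is only safe
// when the loaded type's store size matches the known range size.
static bool loadSizeMatchesRange(const llvm::DataLayout &DL, llvm::Type &Ty,
                                 int64_t RangeSize) {
  int64_t StoreSize = DL.getTypeStoreSize(&Ty);
  return StoreSize == RangeSize;
}
```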
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
index 423ce82..d5947c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
@@ -90,9 +90,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
@@ -328,3 +326,30 @@ body: |
$vgpr0_vgpr1 = COPY %4
...
+
+---
+name: shufflevector_cache_element
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: shufflevector_cache_element
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<8 x s32>), [[COPY1]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1
+ %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %2, %2, shufflemask(0, 0, 0, 0, 3, 3, 3, 3)
+ %4:_(s32) = G_EXTRACT_VECTOR_ELT %3(<8 x s32>), %1(s32)
+ $vgpr0 = COPY %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
index 10f425c..dff8111 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
@@ -237,16 +237,13 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
;
; GFX9-LABEL: name: shufflevector_v2s16_v2s16_0_0
; GFX9: liveins: $vgpr0, $vgpr1
@@ -257,10 +254,7 @@ body: |
; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -285,12 +279,10 @@ body: |
; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
- ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
;
; GFX9-LABEL: name: shufflevector_v2s16_v2s16_1_1
; GFX9: liveins: $vgpr0, $vgpr1
@@ -301,10 +293,7 @@ body: |
; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -329,16 +318,13 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
;
; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_2
; GFX9: liveins: $vgpr0, $vgpr1
@@ -349,10 +335,7 @@ body: |
; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -641,12 +624,10 @@ body: |
; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
- ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
;
; GFX9-LABEL: name: shufflevector_v2s16_v2s16_3_3
; GFX9: liveins: $vgpr0, $vgpr1
@@ -657,10 +638,7 @@ body: |
; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -733,16 +711,13 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
;
; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_0
; GFX9: liveins: $vgpr0, $vgpr1
@@ -753,10 +728,7 @@ body: |
; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index 4b6d14e..1d5c858 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -53,14 +53,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) {
; BSWAP is not supported by 32 bit target
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -223,21 +221,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) {
define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r2, #255
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: mov r12, #65280
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: and r2, r12, r1, lsr #8
-; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, r1, #65280
-; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: lsl r1, r1, #24
-; CHECK-NEXT: orr r1, r1, r3, lsl #8
-; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: orr r2, r2, #16711680
+; CHECK-NEXT: and r3, r0, r2
+; CHECK-NEXT: and r0, r2, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r3, ror #8
+; CHECK-NEXT: and r3, r1, r2
+; CHECK-NEXT: and r1, r2, r1, ror #24
+; CHECK-NEXT: orr r1, r1, r3, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -377,14 +370,12 @@ define i64 @load_i64_by_i8(ptr %arg) {
define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
@@ -434,14 +425,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
@@ -587,14 +576,12 @@ declare i16 @llvm.bswap.i16(i16)
define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
@@ -667,14 +654,12 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, r1
-; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r1, #255
+; CHECK-NEXT: orr r1, r1, #16711680
; CHECK-NEXT: ldr r0, [r0, #12]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
@@ -733,14 +718,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r1, r0
-; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r1, #255
+; CHECK-NEXT: orr r1, r1, #16711680
; CHECK-NEXT: ldr r0, [r0, #13]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 0f6ec8a..7087367 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -117,14 +117,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) {
; BSWAP is not supported by 32 bit target
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -237,21 +235,16 @@ define i64 @load_i64_by_i8(ptr %arg) {
define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r2, #255
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: mov r12, #65280
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: and r2, r12, r1, lsr #8
-; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, r1, #65280
-; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: lsl r1, r1, #24
-; CHECK-NEXT: orr r1, r1, r3, lsl #8
-; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: orr r2, r2, #16711680
+; CHECK-NEXT: and r3, r0, r2
+; CHECK-NEXT: and r0, r2, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r3, ror #8
+; CHECK-NEXT: and r3, r1, r2
+; CHECK-NEXT: and r1, r2, r1, ror #24
+; CHECK-NEXT: orr r1, r1, r3, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -413,14 +406,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
@@ -469,14 +460,12 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
@@ -527,14 +516,12 @@ declare i16 @llvm.bswap.i16(i16)
define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: orr r1, r1, #16711680
+; CHECK-NEXT: and r2, r0, r1
+; CHECK-NEXT: and r0, r1, r0, ror #24
+; CHECK-NEXT: orr r0, r0, r2, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
diff --git a/llvm/test/Transforms/Attributor/range-and-constant-fold.ll b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll
new file mode 100644
index 0000000..a8f3309
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=attributor %s -o - | FileCheck %s
+
+@g = internal unnamed_addr addrspace(4) constant [3 x i8] c"12\00", align 16
+
+define void @foo(i32 %a, i32 %b, ptr %p.0, ptr %p.1) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], ptr nofree nonnull writeonly captures(none) dereferenceable(1) [[P_0:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(8) [[P_1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK: [[L1]]:
+; CHECK-NEXT: br label %[[L3:.*]]
+; CHECK: [[L2]]:
+; CHECK-NEXT: br label %[[L3]]
+; CHECK: [[L3]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ @g, %[[L1]] ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %[[L2]] ]
+; CHECK-NEXT: [[LOAD_SMALL:%.*]] = load i8, ptr addrspace(4) [[PHI]], align 4
+; CHECK-NEXT: store i8 [[LOAD_SMALL]], ptr [[P_0]], align 1
+; CHECK-NEXT: [[LOAD_LARGE:%.*]] = load i64, ptr addrspace(4) [[PHI]], align 4
+; CHECK-NEXT: store i64 [[LOAD_LARGE]], ptr [[P_1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp ne i32 %a, %b
+ br i1 %cmp, label %l1, label %l2
+l1:
+ br label %l3
+l2:
+ br label %l3
+l3:
+ %phi = phi ptr addrspace(4) [ @g, %l1 ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %l2 ]
+ %load.small = load i8, ptr addrspace(4) %phi
+ store i8 %load.small, ptr %p.0
+ %load.large = load i64, ptr addrspace(4) %phi
+ store i64 %load.large, ptr %p.1
+ ret void
+}
diff --git a/llvm/tools/llvm-cov/llvm-cov.cpp b/llvm/tools/llvm-cov/llvm-cov.cpp
index 5ada5578..52610eff 100644
--- a/llvm/tools/llvm-cov/llvm-cov.cpp
+++ b/llvm/tools/llvm-cov/llvm-cov.cpp
@@ -71,8 +71,8 @@ int main(int argc, const char **argv) {
.Case("gcov", gcovMain)
.Case("report", reportMain)
.Case("show", showMain)
- .Cases("-h", "-help", "--help", helpMain)
- .Cases("-version", "--version", versionMain)
+ .Cases({"-h", "-help", "--help"}, helpMain)
+ .Cases({"-version", "--version"}, versionMain)
.Default(nullptr);
if (Func) {
diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp
index 50e9c6e..aceb4f3 100644
--- a/llvm/unittests/ADT/DenseMapTest.cpp
+++ b/llvm/unittests/ADT/DenseMapTest.cpp
@@ -70,6 +70,16 @@ public:
int getValue() const { return Value; }
bool operator==(const CtorTester &RHS) const { return Value == RHS.Value; }
+
+ // Return the number of live CtorTester objects, excluding the empty and
+ // tombstone keys.
+ static size_t getNumConstructed() {
+ return std::count_if(Constructed.begin(), Constructed.end(),
+ [](const CtorTester *Obj) {
+ int V = Obj->getValue();
+ return V != -1 && V != -2;
+ });
+ }
};
std::set<CtorTester *> CtorTester::Constructed;
@@ -1031,4 +1041,64 @@ TEST(SmallDenseMapCustomTest, InitSize) {
}
}
+TEST(DenseMapCustomTest, KeyDtor) {
+ // This test relies on CtorTester being non-trivially destructible.
+ static_assert(!std::is_trivially_destructible_v<CtorTester>,
+ "CtorTester must not be trivially destructible");
+
+ // Test that keys are destructed on scope exit.
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ {
+ DenseMap<CtorTester, int, CtorTesterMapInfo> Map;
+ Map.try_emplace(CtorTester(0), 1);
+ Map.try_emplace(CtorTester(1), 2);
+ EXPECT_EQ(2u, CtorTester::getNumConstructed());
+ }
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+
+ // Test that keys are destructed on erase and shrink_and_clear.
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ {
+ DenseMap<CtorTester, int, CtorTesterMapInfo> Map;
+ Map.try_emplace(CtorTester(0), 1);
+ Map.try_emplace(CtorTester(1), 2);
+ EXPECT_EQ(2u, CtorTester::getNumConstructed());
+ Map.erase(CtorTester(1));
+ EXPECT_EQ(1u, CtorTester::getNumConstructed());
+ Map.shrink_and_clear();
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ }
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+}
+
+TEST(DenseMapCustomTest, ValueDtor) {
+ // This test relies on CtorTester being non-trivially destructible.
+ static_assert(!std::is_trivially_destructible_v<CtorTester>,
+ "CtorTester must not be trivially destructible");
+
+ // Test that values are destructed on scope exit.
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ {
+ DenseMap<int, CtorTester> Map;
+ Map.try_emplace(0, CtorTester(1));
+ Map.try_emplace(1, CtorTester(2));
+ EXPECT_EQ(2u, CtorTester::getNumConstructed());
+ }
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+
+ // Test that values are destructed on erase and shrink_and_clear.
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ {
+ DenseMap<int, CtorTester> Map;
+ Map.try_emplace(0, CtorTester(1));
+ Map.try_emplace(1, CtorTester(2));
+ EXPECT_EQ(2u, CtorTester::getNumConstructed());
+ Map.erase(1);
+ EXPECT_EQ(1u, CtorTester::getNumConstructed());
+ Map.shrink_and_clear();
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+ }
+ EXPECT_EQ(0u, CtorTester::getNumConstructed());
+}
+
} // namespace
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index d88a0ff..c94feb5 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -157,7 +157,7 @@ TEST(StringSwitchTest, Cases) {
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
- .Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
+ .Cases({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"},
OSType::Windows)
.Cases({"linux", "unix", "*nix", "posix"}, OSType::Linux)
.Cases({"macos", "osx"}, OSType::MacOS)
@@ -189,7 +189,7 @@ TEST(StringSwitchTest, CasesLower) {
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
- .CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
+ .CasesLower({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"},
OSType::Windows)
.CasesLower({"linux", "unix", "*nix", "posix"}, OSType::Linux)
.CasesLower({"macos", "osx"}, OSType::MacOS)
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index b570f8d..151b065 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -71,6 +71,7 @@ CodeGen/Hexagon/autohvx/interleave.ll
CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll
CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll
CodeGen/Hexagon/loop-idiom/memmove-rt-check.ll
+CodeGen/Hexagon/masked_gather.ll
CodeGen/NVPTX/lower-ctor-dtor.ll
CodeGen/RISCV/zmmul.ll
CodeGen/WebAssembly/memory-interleave.ll
@@ -331,6 +332,10 @@ Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll
Instrumentation/MemorySanitizer/AArch64/module-flags-aarch64.ll
Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
Instrumentation/MemorySanitizer/AArch64/qshrn.ll
+Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
+Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
+Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
+Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
Instrumentation/MemorySanitizer/AArch64/vararg.ll
Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll
Instrumentation/MemorySanitizer/abs-vector.ll
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 1437c39..37db096 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1111,9 +1111,9 @@ def AMDGPU_ScaledMFMAOp :
AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>,
Pure]>,
Arguments<(ins
- I32Attr:$m,
- I32Attr:$n,
- I32Attr:$k,
+ ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$m,
+ ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$n,
+ ConfinedAttr<I32Attr, [IntIsOneOf<[64, 128]>]>:$k,
ScaledMFMAInTypes:$sourceA,
ScaledMFMAInTypes:$sourceB,
ScaledMFMAOutTypes:$destC,
@@ -1126,8 +1126,8 @@ def AMDGPU_ScaledMFMAOp :
let summary = "MLIR wrapper for CDNA scaled mfma instructions";
let description = [{
The `amdgpu.scaled_mfma` op is an MLIR wrapper around intrinsics
- for various scaled versions of `mfma` instructions in the CDNA architecture, which perform
- multiple outer products in order to allow fast matrix multiplication.
+ for various scaled versions of `mfma` instructions in the CDNA architecture, which
+ perform multiple outer products in order to allow fast matrix multiplication.
The wrapper will select an appropriate `mfma` instruction, if one is available,
based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
@@ -1142,15 +1142,23 @@ def AMDGPU_ScaledMFMAOp :
This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
- `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
- fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
- size.
+ fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as
+ their tile size.
- `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
- are omitted from this wrapper.
- - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
- double-precision operations on gfx94x and so are not included here.
+ are omitted from this wrapper.
+ - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported
+ for double-precision operations on gfx94x and so are not included here.
+
+ Example:
+ ```mlir
+ %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2
+ : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+ ```
}];
let assemblyFormat = [{
- `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
+ custom<MNKDimensionList>($m, $n, $k) ` `
+ `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*`
+ `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
attr-dict
`:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC)
}];
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
index 6de55d5..c746d76 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
@@ -55,10 +55,10 @@ func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>,
// CHECK-LABEL: func @scaled_mfma_to_rocdl(
// CHECK-SAME: %[[ARG0:.*]]: vector<16xf32>, %[[ARG1:.*]]: vector<4xf32>, %[[ARG2:.*]]: vector<32xf8E4M3FN>, %[[ARG3:.*]]: vector<32xf8E5M2>, %[[ARG4:.*]]: vector<32xf6E2M3FN>, %[[ARG5:.*]]: vector<32xf6E3M2FN>, %[[ARG6:.*]]: vector<32xf4E2M1FN>, %[[ARG7:.*]]: vector<4xf8E8M0FNU>, %[[ARG8:.*]]: f8E8M0FNU
func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
- %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>,
- %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>,
- %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>,
- %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) {
+ %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>,
+ %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>,
+ %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>,
+ %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) {
// CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32
@@ -66,39 +66,39 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
// CHECK: %[[z0:.+]] = llvm.zext {{.*}} : i8 to i32
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32>
+ amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32>
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32>
+ amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32>
// CHECK: llvm.bitcast
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32>
+ amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32>
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32>
+ amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32>
// CHECK: llvm.bitcast
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+ amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32>
+ amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32>
// CHECK: llvm.bitcast
// CHECK: llvm.mlir.constant(3 : i32) : i32
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32>
+ amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32>
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32>
+ amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32>
// CHECK: llvm.bitcast
// CHECK: llvm.mlir.constant(4 : i32) : i32
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32>
+ amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32>
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
- amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32>
+ amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32>
func.return
}
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index 52d3275..fee0c00 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -165,10 +165,10 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
// CHECK-LABEL: func @scaled_mfma
// CHECK: %[[SCALE_1:.*]] = vector.extract_strided_slice %0 {offsets = [0], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
// CHECK: %[[SCALE_2:.*]] = vector.extract_strided_slice %2 {offsets = [4], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma(%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}}
// CHECK: %[[SCALE_3:.*]] = vector.extract_strided_slice %5 {offsets = [8], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
// CHECK: %[[SCALE_4:.*]] = vector.extract_strided_slice %7 {offsets = [12], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma(%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}}
func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2x1x8x1xf8E8M0FNU>, %scalesB: vector<2x1x8x1xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>) {
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -176,12 +176,12 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc
%sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
%scaleB = vector.extract %scalesB[0, 0, 6, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
%sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
- %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
%scaleC = vector.extract %scalesA[1, 0, 1, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
%sC = vector.insert %scaleC, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
%scaleD = vector.extract %scalesB[1, 0, 4, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
%sD = vector.insert %scaleD, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
- %res_1 = amdgpu.scaled_mfma(%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_1 = amdgpu.scaled_mfma 16x16x128 (%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
return %res_0, %res_1 : vector<4xf32>, vector<4xf32>
}

@@ -192,7 +192,7 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc
// CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU>
// CHECK: vector.extract {{.*}} : f8E8M0FNU from vector<2xf8E8M0FNU>
// CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 ({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}}
func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2xf8E8M0FNU>, %scalesB: vector<2xf8E8M0FNU>) -> vector<4xf32> {
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -200,17 +200,17 @@ func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
%sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
%scaleB = vector.extract %scalesB[1] : f8E8M0FNU from vector<2xf8E8M0FNU>
%sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
- %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
return %res_0 : vector<4xf32>
}

// -----

// CHECK-LABEL: func @scaled_mfma_ugly_shapes
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<5x5xf8E8M0FNU>, %scalesB: vector<7x23xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -237,10 +237,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
%sB_6_21 = vector.insert %scaleB_6_21, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
%sB_6_20 = vector.insert %scaleB_6_20, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
%sB_6_19 = vector.insert %scaleB_6_19, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
-
- %res_4 = amdgpu.scaled_mfma(%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
- %res_5 = amdgpu.scaled_mfma(%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
- %res_6 = amdgpu.scaled_mfma(%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
- %res_7 = amdgpu.scaled_mfma(%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+
+ %res_4 = amdgpu.scaled_mfma 16x16x128 (%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_5 = amdgpu.scaled_mfma 16x16x128 (%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_6 = amdgpu.scaled_mfma 16x16x128 (%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+ %res_7 = amdgpu.scaled_mfma 16x16x128 (%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
return %res_4, %res_5, %res_6, %res_7 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index b27dfdf..5784764 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -314,3 +314,27 @@ func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16>
func.return
}
+
+// -----
+
+func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+ // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+ %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+ func.return %0 : vector<16xf32>
+}
+
+// -----
+
+func.func @scaled_mfma_invalid_n(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+ // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+ %0 = amdgpu.scaled_mfma 32x8x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+ func.return %0 : vector<16xf32>
+}
+
+// -----
+
+func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+ // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {64, 128}}}
+ %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+ func.return %0 : vector<16xf32>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index d0bf0d7..a330967 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -609,8 +609,8 @@ func.func @permlane32_swap(%arg0 : f32) -> f32 {

// CHECK-LABEL: func @scaled_mfma
func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
- // CHECK: amdgpu.scaled_mfma
- %0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+ // CHECK: amdgpu.scaled_mfma 32x32x64
+ %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
func.return %0 : vector<16xf32>
}