67 files changed, 657 insertions, 524 deletions
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 06e840e..e1a1856 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -346,13 +346,16 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { uint64_t Bytes; Function.markUnreachableBlocks(); LLVM_DEBUG({ + bool HasInvalidBB = false; for (BinaryBasicBlock &BB : Function) { if (!BB.isValid()) { + HasInvalidBB = true; dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() << " in function " << Function << "\n"; - Function.dump(); } } + if (HasInvalidBB) + Function.dump(); }); BinaryContext::IndependentCodeEmitter Emitter = BC.createIndependentMCCodeEmitter(); diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index bef6e9c1..495f2ab 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -32,7 +32,7 @@ Feature Checking Macros ======================= Language extensions can be very useful, but only if you know you can depend on -them. In order to allow fine-grain features checks, we support three builtin +them. In order to allow fine-grained feature checks, we support three builtin function-like macros. This allows you to directly test for a feature in your code without having to resort to something like autoconf or fragile "compiler version checks". @@ -137,7 +137,7 @@ feature) or 0 if not. They can be used like this: .. _langext-has-feature-back-compat: For backward compatibility, ``__has_feature`` can also be used to test -for support for non-standardized features, i.e. features not prefixed ``c_``, +for support for non-standardized features, i.e., features not prefixed ``c_``, ``cxx_`` or ``objc_``. Another use of ``__has_feature`` is to check for compiler features not related @@ -159,7 +159,7 @@ of ``cxx_rvalue_references``. This function-like macro is available in C++20 by default, and is provided as an extension in earlier language standards. It takes a single argument that is the -name of a double-square-bracket-style attribute. The argument can either be a +name of a double-square-bracket-style attribute. The argument can be either a single identifier or a scoped identifier. If the attribute is supported, a nonzero value is returned. If the attribute is a standards-based attribute, this macro returns a nonzero value based on the year and month in which the attribute @@ -320,7 +320,7 @@ To test for this feature, use ``#if defined(__has_include)``: .. code-block:: c++ - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include) #if __has_include("myinclude.h") # include "myinclude.h" @@ -345,7 +345,7 @@ or 0 otherwise: # include_next "myinclude.h" #endif - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include_next) #if __has_include_next("myinclude.h") # include_next "myinclude.h" @@ -361,7 +361,7 @@ absolute path is used in the file argument. ----------------- This function-like macro takes a string literal that represents a command line -option for a warning and returns true if that is a valid warning option. +option for a warning and returns ``true`` if that is a valid warning option. .. code-block:: c++ @@ -396,7 +396,7 @@ Builtin Macros file. ``__clang__`` - Defined when compiling with Clang + Defined when compiling with Clang. 
``__clang_major__`` Defined to the major marketing version number of Clang (e.g., the 2 in @@ -599,7 +599,7 @@ NEON vector types are created using ``neon_vector_type`` and GCC vector types are created using the ``vector_size(N)`` attribute. The argument ``N`` specifies the number of bytes that will be allocated for an -object of this type. The size has to be multiple of the size of the vector +object of this type. The size has to be a multiple of the size of the vector element type. For example: .. code-block:: c++ @@ -620,7 +620,7 @@ element type. For example: Boolean Vectors --------------- -Clang also supports the ext_vector_type attribute with boolean element types in +Clang also supports the ``ext_vector_type`` attribute with boolean element types in C and C++. For example: .. code-block:: c++ @@ -674,7 +674,7 @@ Vector Literals Vector literals can be used to create vectors from a set of scalars, or vectors. Either parentheses or braces form can be used. In the parentheses -form the number of literal values specified must be one, i.e. referring to a +form the number of literal values specified must be one, i.e., referring to a scalar value, or must match the size of the vector type being created. If a single scalar literal value is specified, the scalar literal value will be replicated to all the components of the vector type. In the brackets form any @@ -725,12 +725,12 @@ address &v[i] no no no [#]_ no no See also :ref:`langext-__builtin_shufflevector`, :ref:`langext-__builtin_convertvector`. -.. [#] ternary operator(?:) has different behaviors depending on condition - operand's vector type. If the condition is a GNU vector (i.e. __vector_size__), +.. [#] ternary operator(?:) has different behaviors depending on the condition + operand's vector type. If the condition is a GNU vector (i.e., ``__vector_size__``), a NEON vector or an SVE vector, it's only available in C++ and uses normal bool conversions (that is, != 0). If it's an extension (OpenCL) vector, it's only available in C and OpenCL C. - And it selects based on signedness of the condition operands (OpenCL v1.1 s6.3.9). + And it selects based on the signedness of the condition operands (OpenCL v1.1 s6.3.9). .. [#] sizeof can only be used on vector length specific SVE types. .. [#] Clang does not allow the address of an element to be taken while GCC allows this. This is intentional for vectors with a boolean element type and @@ -747,7 +747,7 @@ to perform additional operations on certain scalar and vector types. Let ``T`` be one of the following types: * an integer type (as in C23 6.2.5p22), but excluding enumerated types and ``bool`` -* the standard floating types float or double +* the standard floating types ``float`` or ``double`` * a half-precision floating point type, if one is supported on the target * a vector type. @@ -833,7 +833,7 @@ of different sizes and signs is forbidden in binary and ternary builtins. T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types of a floating-point number T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types - T __builtin_elementwise_fmod(T x, T y) return The floating-point remainder of (x/y) whose sign floating point types + T __builtin_elementwise_fmod(T x, T y) return the floating-point remainder of (x/y) whose sign floating point types matches the sign of x. 
T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types For floating point types, follows semantics of maxNum @@ -948,7 +948,7 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``. Each builtin accesses memory according to a provided boolean mask. These are provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first -argument is always boolean mask vector. The ``__builtin_masked_load`` builtin +argument is always a boolean mask vector. The ``__builtin_masked_load`` builtin takes an optional third vector argument that will be used for the result of the masked-off lanes. These builtins assume the memory is unaligned, use ``__builtin_assume_aligned`` if alignment is desired. @@ -1109,7 +1109,7 @@ to ``float``; see below for more information on this emulation. ``__fp16`` and ``_Float16`` both use the binary16 format from IEEE 754-2008, which provides a 5-bit exponent and an 11-bit significand -(counting the implicit leading 1). ``__bf16`` uses the `bfloat16 +(including the implicit leading 1). ``__bf16`` uses the `bfloat16 <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ format, which provides an 8-bit exponent and an 8-bit significand; this is the same exponent range as `float`, just with greatly reduced precision. @@ -1143,7 +1143,7 @@ types with the ``-ffloat16-excess-precision=`` and * ``none``: meaning to perform strict operation-by-operation emulation * ``standard``: meaning that excess precision is permitted under the rules - described in the standard, i.e. never across explicit casts or statements + described in the standard, i.e., never across explicit casts or statements * ``fast``: meaning that excess precision is permitted whenever the optimizer sees an opportunity to avoid truncations; currently this has no effect beyond ``standard`` @@ -1899,7 +1899,7 @@ Type Trait Primitives Type trait primitives are special builtin constant expressions that can be used by the standard C++ library to facilitate or simplify the implementation of -user-facing type traits in the <type_traits> header. +user-facing type traits in the ``<type_traits>`` header. They are not intended to be used directly by user code because they are implementation-defined and subject to change -- as such they're tied closely to @@ -2054,7 +2054,7 @@ The following type trait primitives are supported by Clang. Those traits marked * ``__is_volatile`` (C++, Embarcadero) * ``__reference_binds_to_temporary(T, U)`` (Clang): Determines whether a reference of type ``T`` bound to an expression of type ``U`` would bind to a - materialized temporary object. If ``T`` is not a reference type the result + materialized temporary object. If ``T`` is not a reference type, the result is false. Note this trait will also return false when the initialization of ``T`` from ``U`` is ill-formed. Deprecated, use ``__reference_constructs_from_temporary``. @@ -2182,7 +2182,7 @@ Prior to clang-16, the output may only be used safely when the indirect branches are not taken. Query for this difference with ``__has_extension(gnu_asm_goto_with_outputs_full)``. -When using tied-outputs (i.e. outputs that are inputs and outputs, not just +When using tied-outputs (i.e., outputs that are inputs and outputs, not just outputs) with the `+r` constraint, there is a hidden input that's created before the label, so numeric references to operands must account for that. 
@@ -2361,7 +2361,7 @@ Loading from a ``__weak`` variable always implicitly retains the loaded value. In non-ARC modes, this retain is normally balanced by an implicit autorelease. This autorelease can be suppressed by performing the load in the receiver position of a ``-retain`` -message send (e.g. ``[weakReference retain]``); note that this performs +message send (e.g., ``[weakReference retain]``); note that this performs only a single retain (the retain done when primitively loading from the weak reference). @@ -2562,7 +2562,7 @@ When a method that's introduced in the OS newer than the target OS is called, a .. code-block:: objc void my_fun(NSSomeClass* var) { - // If fancyNewMethod was added in e.g. macOS 10.12, but the code is + // If fancyNewMethod was added in e.g., macOS 10.12, but the code is // built with -mmacos-version-min=10.11, then this unconditional call // will emit a -Wunguarded-availability warning: [var fancyNewMethod]; @@ -2725,7 +2725,7 @@ correctly in any circumstances. It can be used if: metaprogramming algorithms to be able to specify/detect types generically. - the generated kernel binary does not contain indirect calls because they - are eliminated using compiler optimizations e.g. devirtualization. + are eliminated using compiler optimizations e.g., devirtualization. - the selected target supports the function pointer like functionality e.g. most CPU targets. @@ -2755,12 +2755,12 @@ Extension Specification, section 1.2 <https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#extensions-overview>`_. This is not conformant behavior and it can only be used portably when the -functions with variadic prototypes do not get generated in binary e.g. the +functions with variadic prototypes do not get generated in binary e.g., the variadic prototype is used to specify a function type with any number of arguments in metaprogramming algorithms in C++ for OpenCL. This extension can also be used when the kernel code is intended for targets -supporting the variadic arguments e.g. majority of CPU targets. +supporting the variadic arguments e.g., majority of CPU targets. **Example of Use**: @@ -5170,8 +5170,8 @@ __builtin_amdgcn_fence ``__builtin_amdgcn_fence`` emits a fence. -* ``unsigned`` atomic ordering, e.g. ``__ATOMIC_ACQUIRE`` -* ``const char *`` synchronization scope, e.g. ``workgroup`` +* ``unsigned`` atomic ordering, e.g., ``__ATOMIC_ACQUIRE`` +* ``const char *`` synchronization scope, e.g., ``workgroup`` * Zero or more ``const char *`` address spaces names. The address spaces arguments must be one of the following string literals: @@ -5205,7 +5205,7 @@ boolean argument as a bit for every lane of the current wave that is currently active (i.e., that is converged with the executing thread), and a 0 bit for every lane that is not active. -The result is uniform, i.e. it is the same in every active thread of the wave. +The result is uniform, i.e., it is the same in every active thread of the wave. __builtin_amdgcn_inverse_ballot_w{32,64} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5374,7 +5374,7 @@ followed by ``off`` or ``on``. All function definitions in the region between an ``off`` and the following ``on`` will be decorated with the ``optnone`` attribute unless doing so would -conflict with explicit attributes already present on the function (e.g. the +conflict with explicit attributes already present on the function (e.g., the ones that control inlining). .. 
code-block:: c++ @@ -5472,12 +5472,12 @@ provides options for vectorization, interleaving, predication, unrolling and distribution. Loop hints can be specified before any loop and will be ignored if the optimization is not safe to apply. -There are loop hints that control transformations (e.g. vectorization, loop +There are loop hints that control transformations (e.g., vectorization, loop unrolling) and there are loop hints that set transformation options (e.g. ``vectorize_width``, ``unroll_count``). Pragmas setting transformation options -imply the transformation is enabled, as if it was enabled via the corresponding -transformation pragma (e.g. ``vectorize(enable)``). If the transformation is -disabled (e.g. ``vectorize(disable)``), that takes precedence over +imply the transformation is enabled, as if it were enabled via the corresponding +transformation pragma (e.g., ``vectorize(enable)``). If the transformation is +disabled (e.g., ``vectorize(disable)``), that takes precedence over transformations option pragmas implying that transformation. Vectorization, Interleaving, and Predication @@ -5704,7 +5704,7 @@ operation when supported by the target. The pragma can take three values: ``on``, ``fast`` and ``off``. The ``on`` option is identical to using ``#pragma STDC FP_CONTRACT(ON)`` and it allows fusion as specified the language standard. The ``fast`` option allows fusion -in cases when the language standard does not make this possible (e.g. across +in cases when the language standard does not make this possible (e.g., across statements in C). .. code-block:: c++ @@ -5859,14 +5859,14 @@ and ``ignore_denormal_mode``, each optionally prefixed with ``no_``. The meaning are as follows: - ``remote_memory`` means atomic operations may be performed on remote - memory, i.e. memory accessed through off-chip interconnects (e.g., PCIe). + memory, i.e., memory accessed through off-chip interconnects (e.g., PCIe). On ROCm platforms using HIP, remote memory refers to memory accessed via PCIe and is subject to specific atomic operation support. See `ROCm PCIe Atomics <https://rocm.docs.amd.com/en/latest/conceptual/ pcie-atomics.html>`_ for further details. Prefixing with ``no_remote_memory`` indicates that atomic operations should not be performed on remote memory. - ``fine_grained_memory`` means atomic operations may be performed on fine-grained - memory, i.e. memory regions that support fine-grained coherence, where updates to + memory, i.e., memory regions that support fine-grained coherence, where updates to memory are visible to other parts of the system even while modifications are ongoing. For example, in HIP, fine-grained coherence ensures that host and device share up-to-date data without explicit synchronization (see @@ -6088,7 +6088,7 @@ match rules are specified after the attribute. The compiler expects an identifier that corresponds to the subject set specifier. The ``apply_to`` specifier is currently the only supported subject set specifier. It allows you to specify match rules that form a subset of the attribute's allowed subject -set, i.e. the compiler doesn't require all of the attribute's subjects. For +set, i.e., the compiler doesn't require all of the attribute's subjects. 
For example, an attribute like ``[[nodiscard]]`` whose subject set includes ``enum``, ``record`` and ``hasType(functionType)``, requires the presence of at least one of these rules after ``apply_to``: diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index c2d1bc1..a97e934 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -811,10 +811,10 @@ bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const { bool AArch64TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) - .Cases("aarch64", "arm64", "arm", true) + .Cases({"aarch64", "arm64", "arm"}, true) .Case("fmv", HasFMV) .Case("fp", FPU & FPUMode) - .Cases("neon", "simd", FPU & NeonMode) + .Cases({"neon", "simd"}, FPU & NeonMode) .Case("jscvt", HasJSCVT) .Case("fcma", HasFCMA) .Case("rng", HasRandGen) @@ -829,8 +829,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("cssc", HasCSSC) .Case("sha2", HasSHA2) .Case("sha3", HasSHA3) - .Cases("aes", "pmull", HasAES) - .Cases("fp16", "fullfp16", HasFullFP16) + .Cases({"aes", "pmull"}, HasAES) + .Cases({"fp16", "fullfp16"}, HasFullFP16) .Case("dit", HasDIT) .Case("dpb", HasCCPP) .Case("dpb2", HasCCDP) @@ -859,9 +859,9 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("memtag", HasMTE) .Case("sb", HasSB) .Case("predres", HasPredRes) - .Cases("ssbs", "ssbs2", HasSSBS) + .Cases({"ssbs", "ssbs2"}, HasSSBS) .Case("bti", HasBTI) - .Cases("ls64", "ls64_v", "ls64_accdata", HasLS64) + .Cases({"ls64", "ls64_v", "ls64_accdata"}, HasLS64) .Case("wfxt", HasWFxT) .Case("rcpc3", HasRCPC3) .Case("fp8", HasFP8) diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 9f3a4cd..846b240 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -122,41 +122,41 @@ public: .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Cases("power3", "pwr3", ArchDefinePpcgr) - .Cases("power4", "pwr4", + .Cases({"power3", "pwr3"}, ArchDefinePpcgr) + .Cases({"power4", "pwr4"}, ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Cases("power5", "pwr5", ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power5x", "pwr5x", + .Cases({"power5x", "pwr5x"}, ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6", "pwr6", - ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | - ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6x", "pwr6x", + .Cases({"power6", "pwr6"}, ArchDefinePwr6 | ArchDefinePwr5x | + ArchDefinePwr5 | ArchDefinePwr4 | + ArchDefinePpcgr | ArchDefinePpcsq) + .Cases({"power6x", "pwr6x"}, ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power7", "pwr7", - ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | - ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | - ArchDefinePpcsq) + .Cases({"power7", "pwr7"}, ArchDefinePwr7 | ArchDefinePwr6 | + ArchDefinePwr5x | ArchDefinePwr5 | + ArchDefinePwr4 | ArchDefinePpcgr | + ArchDefinePpcsq) // powerpc64le automatically defaults to at least power8. 
- .Cases("power8", "pwr8", "ppc64le", + .Cases({"power8", "pwr8", "ppc64le"}, ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power9", "pwr9", + .Cases({"power9", "pwr9"}, ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power10", "pwr10", + .Cases({"power10", "pwr10"}, ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power11", "pwr11", + .Cases({"power11", "pwr11"}, ArchDefinePwr11 | ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d2cb751..bd36eb4 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3214,7 +3214,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, DashX = llvm::StringSwitch<InputKind>(XValue) .Case("cpp-output", InputKind(Language::C).getPreprocessed()) .Case("assembler-with-cpp", Language::Asm) - .Cases("ast", "pcm", "precompiled-header", + .Cases({"ast", "pcm", "precompiled-header"}, InputKind(Language::Unknown, InputKind::Precompiled)) .Case("ir", Language::LLVM_IR) .Case("cir", Language::CIR) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 5b08d7f..1c06152 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1283,10 +1283,10 @@ bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, // Per tweaked N3660, "il", "i", and "if" are also used in the library. // In C++2a "d" and "y" are used in the library. return llvm::StringSwitch<bool>(Suffix) - .Cases("h", "min", "s", true) - .Cases("ms", "us", "ns", true) - .Cases("il", "i", "if", true) - .Cases("d", "y", LangOpts.CPlusPlus20) + .Cases({"h", "min", "s"}, true) + .Cases({"ms", "us", "ns"}, true) + .Cases({"il", "i", "if"}, true) + .Cases({"d", "y"}, LangOpts.CPlusPlus20) .Default(false); } diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst index da92ed3..855e2cc 100644 --- a/llvm/docs/CIBestPractices.rst +++ b/llvm/docs/CIBestPractices.rst @@ -136,3 +136,20 @@ branches as follows: branches: - main - releases/* + +Container Best Practices +======================== + +This section contains best practices/guidelines when working with containers +for LLVM infrastructure. + +Using Fully Qualified Container Names +------------------------------------- + +When referencing container images from a registry, such as in Github Actions +workflows, or in ``Dockerfile`` files used for building images, prefer fully +qualified names (i.e., including the registry domain) over just the image. +For example, prefer ``docker.io/ubuntu:24.04`` over ``ubuntu:24.04``. This +ensures portability across systems where a different default registry might +be specified and also prevents attackers from changing the default registry +to pull in a malicious image instead of the intended one. 
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h index 1fbc41c..be9546e 100644 --- a/llvm/include/llvm/ADT/Bitfields.h +++ b/llvm/include/llvm/ADT/Bitfields.h @@ -100,8 +100,8 @@ template <typename Bitfield, typename StorageType> struct Impl { using IntegerType = typename Bitfield::IntegerType; static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT; - static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask"); - static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask"); + static_assert(Bitfield::FirstBit < StorageBits, "Data must fit in mask"); + static_assert(Bitfield::LastBit < StorageBits, "Data must fit in mask"); static constexpr StorageType LowMask = maskTrailingOnes<StorageType>(Bitfield::Bits); static constexpr StorageType Mask = LowMask << Bitfield::Shift; diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 6a943c5..0cc03cf 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -177,7 +177,7 @@ public: #if LLVM_ENABLE_THREADS // Lock bucket. - CurBucket.Guard.lock(); + std::scoped_lock<std::mutex> Lock(CurBucket.Guard); #endif HashesPtr BucketHashes = CurBucket.Hashes; @@ -195,11 +195,6 @@ public: CurBucket.NumberOfEntries++; RehashBucket(CurBucket); - -#if LLVM_ENABLE_THREADS - CurBucket.Guard.unlock(); -#endif - return {NewData, true}; } @@ -208,10 +203,6 @@ public: KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { // Already existed entry matched with inserted data is found. -#if LLVM_ENABLE_THREADS - CurBucket.Guard.unlock(); -#endif - return {EntryData, false}; } } diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 25b5262..22ef7ed 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -130,13 +130,13 @@ public: return; } - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); if constexpr (std::is_trivially_destructible_v<ValueT>) { // Use a simpler loop when values don't need destruction. for (BucketT &B : buckets()) B.getFirst() = EmptyKey; } else { - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); unsigned NumEntries = getNumEntries(); for (BucketT &B : buckets()) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey)) { @@ -154,6 +154,17 @@ public: setNumTombstones(0); } + void shrink_and_clear() { + auto [Reallocate, NewNumBuckets] = derived().planShrinkAndClear(); + destroyAll(); + if (!Reallocate) { + initEmpty(); + return; + } + derived().deallocateBuckets(); + derived().init(NewNumBuckets); + } + /// Return true if the specified key is in the map, false otherwise. [[nodiscard]] bool contains(const_arg_type_t<KeyT> Val) const { return doFind(Val) != nullptr; @@ -314,7 +325,7 @@ public: return false; // not in map. 
TheBucket->getSecond().~ValueT(); - TheBucket->getFirst() = getTombstoneKey(); + TheBucket->getFirst() = KeyInfoT::getTombstoneKey(); decrementNumEntries(); incrementNumTombstones(); return true; @@ -322,7 +333,7 @@ public: void erase(iterator I) { BucketT *TheBucket = &*I; TheBucket->getSecond().~ValueT(); - TheBucket->getFirst() = getTombstoneKey(); + TheBucket->getFirst() = KeyInfoT::getTombstoneKey(); decrementNumEntries(); incrementNumTombstones(); } @@ -353,10 +364,17 @@ protected: DenseMapBase() = default; void destroyAll() { + // No need to iterate through the buckets if both KeyT and ValueT are + // trivially destructible. + if constexpr (std::is_trivially_destructible_v<KeyT> && + std::is_trivially_destructible_v<ValueT>) + return; + if (getNumBuckets() == 0) // Nothing to do. return; - const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (BucketT &B : buckets()) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) @@ -366,12 +384,14 @@ protected: } void initEmpty() { + static_assert(std::is_base_of_v<DenseMapBase, DerivedT>, + "Must pass the derived type to this template!"); setNumEntries(0); setNumTombstones(0); assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 && "# initial buckets must be a power of two!"); - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); for (BucketT &B : buckets()) ::new (&B.getFirst()) KeyT(EmptyKey); } @@ -391,8 +411,8 @@ protected: initEmpty(); // Insert all the old elements. - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (BucketT &B : OldBuckets) { if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { @@ -412,9 +432,16 @@ protected: } } - template <typename OtherBaseT> - void copyFrom( - const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) { + void copyFrom(const DerivedT &other) { + this->destroyAll(); + derived().deallocateBuckets(); + setNumEntries(0); + setNumTombstones(0); + if (!derived().allocateBuckets(other.getNumBuckets())) { + // The bucket list is empty. No work to do. 
+ return; + } + assert(&other != this); assert(getNumBuckets() == other.getNumBuckets()); @@ -429,8 +456,8 @@ protected: memcpy(reinterpret_cast<void *>(Buckets), OtherBuckets, NumBuckets * sizeof(BucketT)); } else { - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); for (size_t I = 0; I < NumBuckets; ++I) { ::new (&Buckets[I].getFirst()) KeyT(OtherBuckets[I].getFirst()); if (!KeyInfoT::isEqual(Buckets[I].getFirst(), EmptyKey) && @@ -440,19 +467,6 @@ protected: } } - template <typename LookupKeyT> - static unsigned getHashValue(const LookupKeyT &Val) { - return KeyInfoT::getHashValue(Val); - } - - static const KeyT getEmptyKey() { - static_assert(std::is_base_of_v<DenseMapBase, DerivedT>, - "Must pass the derived type to this template!"); - return KeyInfoT::getEmptyKey(); - } - - static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } - private: DerivedT &derived() { return *static_cast<DerivedT *>(this); } const DerivedT &derived() const { @@ -526,8 +540,6 @@ private: void grow(unsigned AtLeast) { derived().grow(AtLeast); } - void shrink_and_clear() { derived().shrink_and_clear(); } - template <typename LookupKeyT> BucketT *findBucketForInsertion(const LookupKeyT &Lookup, BucketT *TheBucket) { @@ -560,7 +572,7 @@ private: incrementNumEntries(); // If we are writing over a tombstone, remember this. - const KeyT EmptyKey = getEmptyKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey)) decrementNumTombstones(); @@ -574,8 +586,8 @@ private: if (NumBuckets == 0) return nullptr; - const KeyT EmptyKey = getEmptyKey(); - unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1); unsigned ProbeAmt = 1; while (true) { const BucketT *Bucket = BucketsPtr + BucketNo; @@ -612,13 +624,13 @@ private: // FoundTombstone - Keep track of whether we find a tombstone while probing. 
BucketT *FoundTombstone = nullptr; - const KeyT EmptyKey = getEmptyKey(); - const KeyT TombstoneKey = getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); assert(!KeyInfoT::isEqual(Val, EmptyKey) && !KeyInfoT::isEqual(Val, TombstoneKey) && "Empty/Tombstone value shouldn't be inserted into map!"); - unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); + unsigned BucketNo = KeyInfoT::getHashValue(Val) & (NumBuckets - 1); unsigned ProbeAmt = 1; while (true) { BucketT *ThisBucket = BucketsPtr + BucketNo; @@ -719,7 +731,7 @@ public: DenseMap(const DenseMap &other) : BaseT() { init(0); - copyFrom(other); + this->copyFrom(other); } DenseMap(DenseMap &&other) : BaseT() { @@ -755,7 +767,7 @@ public: DenseMap &operator=(const DenseMap &other) { if (&other != this) - copyFrom(other); + this->copyFrom(other); return *this; } @@ -767,56 +779,6 @@ public: return *this; } - void copyFrom(const DenseMap &other) { - this->destroyAll(); - deallocateBuckets(); - if (allocateBuckets(other.NumBuckets)) { - this->BaseT::copyFrom(other); - } else { - NumEntries = 0; - NumTombstones = 0; - } - } - - void grow(unsigned AtLeast) { - unsigned OldNumBuckets = NumBuckets; - BucketT *OldBuckets = Buckets; - - allocateBuckets(std::max<unsigned>( - 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1)))); - assert(Buckets); - if (!OldBuckets) { - this->BaseT::initEmpty(); - return; - } - - this->moveFromOldBuckets( - llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets)); - - // Free the old table. - deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, - alignof(BucketT)); - } - - void shrink_and_clear() { - unsigned OldNumBuckets = NumBuckets; - unsigned OldNumEntries = NumEntries; - this->destroyAll(); - - // Reduce the number of buckets. - unsigned NewNumBuckets = 0; - if (OldNumEntries) - NewNumBuckets = std::max(64, 1 << (Log2_32_Ceil(OldNumEntries) + 1)); - if (NewNumBuckets == NumBuckets) { - this->BaseT::initEmpty(); - return; - } - - deallocate_buffer(Buckets, sizeof(BucketT) * OldNumBuckets, - alignof(BucketT)); - init(NewNumBuckets); - } - private: unsigned getNumEntries() const { return NumEntries; } @@ -855,6 +817,38 @@ private: NumTombstones = 0; } } + + void grow(unsigned AtLeast) { + unsigned OldNumBuckets = NumBuckets; + BucketT *OldBuckets = Buckets; + + allocateBuckets(std::max<unsigned>( + 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1)))); + assert(Buckets); + if (!OldBuckets) { + this->BaseT::initEmpty(); + return; + } + + this->moveFromOldBuckets( + llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets)); + + // Free the old table. + deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, + alignof(BucketT)); + } + + // Plan how to shrink the bucket table. Return: + // - {false, 0} to reuse the existing bucket table + // - {true, N} to reallocate a bucket table with N entries + std::pair<bool, unsigned> planShrinkAndClear() const { + unsigned NewNumBuckets = 0; + if (NumEntries) + NewNumBuckets = std::max(64u, 1u << (Log2_32_Ceil(NumEntries) + 1)); + if (NewNumBuckets == NumBuckets) + return {false, 0}; // Reuse. + return {true, NewNumBuckets}; // Reallocate. 
+ } }; template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4, @@ -896,7 +890,7 @@ public: SmallDenseMap(const SmallDenseMap &other) : BaseT() { init(0); - copyFrom(other); + this->copyFrom(other); } SmallDenseMap(SmallDenseMap &&other) : BaseT() { @@ -928,8 +922,8 @@ public: NumEntries = TmpNumEntries; std::swap(NumTombstones, RHS.NumTombstones); - const KeyT EmptyKey = this->getEmptyKey(); - const KeyT TombstoneKey = this->getTombstoneKey(); + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); if (Small && RHS.Small) { // If we're swapping inline bucket arrays, we have to cope with some of // the tricky bits of DenseMap's storage system: the buckets are not @@ -995,7 +989,7 @@ public: SmallDenseMap &operator=(const SmallDenseMap &other) { if (&other != this) - copyFrom(other); + this->copyFrom(other); return *this; } @@ -1007,86 +1001,6 @@ public: return *this; } - void copyFrom(const SmallDenseMap &other) { - this->destroyAll(); - deallocateBuckets(); - allocateBuckets(other.getNumBuckets()); - this->BaseT::copyFrom(other); - } - - void init(unsigned InitNumEntries) { - auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); - allocateBuckets(InitBuckets); - this->BaseT::initEmpty(); - } - - void grow(unsigned AtLeast) { - if (AtLeast > InlineBuckets) - AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1)); - - if (Small) { - // First move the inline buckets into a temporary storage. - AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage; - BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage); - BucketT *TmpEnd = TmpBegin; - - // Loop over the buckets, moving non-empty, non-tombstones into the - // temporary storage. Have the loop move the TmpEnd forward as it goes. - const KeyT EmptyKey = this->getEmptyKey(); - const KeyT TombstoneKey = this->getTombstoneKey(); - for (BucketT &B : inlineBuckets()) { - if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && - !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { - assert(size_t(TmpEnd - TmpBegin) < InlineBuckets && - "Too many inline buckets!"); - ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst())); - ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond())); - ++TmpEnd; - B.getSecond().~ValueT(); - } - B.getFirst().~KeyT(); - } - - // AtLeast == InlineBuckets can happen if there are many tombstones, - // and grow() is used to remove them. Usually we always switch to the - // large rep here. - allocateBuckets(AtLeast); - this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); - return; - } - - LargeRep OldRep = std::move(*getLargeRep()); - getLargeRep()->~LargeRep(); - allocateBuckets(AtLeast); - - this->moveFromOldBuckets(OldRep.buckets()); - - // Free the old table. - deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, - alignof(BucketT)); - } - - void shrink_and_clear() { - unsigned OldSize = this->size(); - this->destroyAll(); - - // Reduce the number of buckets. 
- unsigned NewNumBuckets = 0; - if (OldSize) { - NewNumBuckets = 1 << (Log2_32_Ceil(OldSize) + 1); - if (NewNumBuckets > InlineBuckets && NewNumBuckets < 64u) - NewNumBuckets = 64; - } - if ((Small && NewNumBuckets <= InlineBuckets) || - (!Small && NewNumBuckets == getLargeRep()->NumBuckets)) { - this->BaseT::initEmpty(); - return; - } - - deallocateBuckets(); - init(NewNumBuckets); - } - private: unsigned getNumEntries() const { return NumEntries; } @@ -1152,7 +1066,7 @@ private: getLargeRep()->~LargeRep(); } - void allocateBuckets(unsigned Num) { + bool allocateBuckets(unsigned Num) { if (Num <= InlineBuckets) { Small = true; } else { @@ -1161,6 +1075,76 @@ private: allocate_buffer(sizeof(BucketT) * Num, alignof(BucketT))); new (getLargeRep()) LargeRep{NewBuckets, Num}; } + return true; + } + + void init(unsigned InitNumEntries) { + auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); + allocateBuckets(InitBuckets); + this->BaseT::initEmpty(); + } + + void grow(unsigned AtLeast) { + if (AtLeast > InlineBuckets) + AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1)); + + if (Small) { + // First move the inline buckets into a temporary storage. + AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage; + BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage); + BucketT *TmpEnd = TmpBegin; + + // Loop over the buckets, moving non-empty, non-tombstones into the + // temporary storage. Have the loop move the TmpEnd forward as it goes. + const KeyT EmptyKey = KeyInfoT::getEmptyKey(); + const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); + for (BucketT &B : inlineBuckets()) { + if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && + !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { + assert(size_t(TmpEnd - TmpBegin) < InlineBuckets && + "Too many inline buckets!"); + ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst())); + ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond())); + ++TmpEnd; + B.getSecond().~ValueT(); + } + B.getFirst().~KeyT(); + } + + // AtLeast == InlineBuckets can happen if there are many tombstones, + // and grow() is used to remove them. Usually we always switch to the + // large rep here. + allocateBuckets(AtLeast); + this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); + return; + } + + LargeRep OldRep = std::move(*getLargeRep()); + getLargeRep()->~LargeRep(); + allocateBuckets(AtLeast); + + this->moveFromOldBuckets(OldRep.buckets()); + + // Free the old table. + deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, + alignof(BucketT)); + } + + // Plan how to shrink the bucket table. Return: + // - {false, 0} to reuse the existing bucket table + // - {true, N} to reallocate a bucket table with N entries + std::pair<bool, unsigned> planShrinkAndClear() const { + unsigned NewNumBuckets = 0; + if (!this->empty()) { + NewNumBuckets = 1u << (Log2_32_Ceil(this->size()) + 1); + if (NewNumBuckets > InlineBuckets) + NewNumBuckets = std::max(64u, NewNumBuckets); + } + bool Reuse = Small ? NewNumBuckets <= InlineBuckets + : NewNumBuckets == getLargeRep()->NumBuckets; + if (Reuse) + return {false, 0}; // Reuse. + return {true, NewNumBuckets}; // Reallocate. 
} }; diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h index b51c586..6502a70 100644 --- a/llvm/include/llvm/Support/Recycler.h +++ b/llvm/include/llvm/Support/Recycler.h @@ -19,6 +19,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> +#include <type_traits> namespace llvm { @@ -72,19 +73,19 @@ public: /// deleted; calling clear is one way to ensure this. template<class AllocatorType> void clear(AllocatorType &Allocator) { - while (FreeList) { - T *t = reinterpret_cast<T *>(pop_val()); - Allocator.Deallocate(t, Size, Align); + if constexpr (std::is_same_v<std::decay_t<AllocatorType>, + BumpPtrAllocator>) { + // For BumpPtrAllocator, Deallocate is a no-op, so just drop the free + // list. + FreeList = nullptr; + } else { + while (FreeList) { + T *t = reinterpret_cast<T *>(pop_val()); + Allocator.Deallocate(t, Size, Align); + } } } - /// Special case for BumpPtrAllocator which has an empty Deallocate() - /// function. - /// - /// There is no need to traverse the free list, pulling all the objects into - /// cache. - void clear(BumpPtrAllocator &) { FreeList = nullptr; } - template<class SubClass, class AllocatorType> SubClass *Allocate(AllocatorType &Allocator) { static_assert(alignof(SubClass) <= Align, diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index a235975..860f73c 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -167,8 +167,9 @@ private: std::vector<GlobMatcher::Glob> Globs; RadixTree<iterator_range<StringRef::const_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>> - PrefixToGlob; + RadixTree<iterator_range<StringRef::const_reverse_iterator>, + SmallVector<const GlobMatcher::Glob *, 1>>> + PrefixSuffixToGlob; }; /// Represents a set of patterns and their line numbers diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 178529f..52c43a4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -9084,6 +9084,8 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { SmallVector<Register, 32> BuildVec; LLT EltTy = DstTy.getScalarType(); + DenseMap<unsigned, Register> CachedExtract; + for (int Idx : Mask) { if (Idx < 0) { if (!Undef.isValid()) @@ -9097,9 +9099,13 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { int NumElts = Src0Ty.getNumElements(); Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; - auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); - auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); - BuildVec.push_back(Extract.getReg(0)); + auto [It, Inserted] = CachedExtract.try_emplace(Idx); + if (Inserted) { + auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); + It->second = + MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK).getReg(0); + } + BuildVec.push_back(It->second); } assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR"); diff --git a/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/llvm/lib/CodeGen/RegisterUsageInfo.cpp index 7a4628a..2ef380f 100644 --- a/llvm/lib/CodeGen/RegisterUsageInfo.cpp +++ b/llvm/lib/CodeGen/RegisterUsageInfo.cpp @@ -44,7 +44,7 @@ void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) { } bool PhysicalRegisterUsageInfo::doInitialization(Module &M) { - RegMasks.grow(M.size()); + RegMasks.reserve(M.size()); return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index dee0909..a522650 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2015,9 +2015,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Register InReg = FuncInfo.InitializeRegForValue(Inst); std::optional<CallingConv::ID> CallConv; - auto *CI = dyn_cast<CallInst>(Inst); - if (CI && !CI->isInlineAsm()) - CallConv = CI->getCallingConv(); + auto *CB = dyn_cast<CallBase>(Inst); + if (CB && !CB->isInlineAsm()) + CallConv = CB->getCallingConv(); RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, Inst->getType(), CallConv); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff9..da4e409 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9899,6 +9899,18 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { // Use a rotate by 8. This can be further expanded if necessary. return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); case MVT::i32: + // This is meant for ARM specifically, which has ROTR but no ROTL. 
+ if (isOperationLegalOrCustom(ISD::ROTR, VT)) { + SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT); + // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF + SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask); + SDValue Rotr = + DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT)); + SDValue Rotl = + DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); + SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask); + return DAG.getNode(ISD::OR, dl, VT, Rotr, And2); + } Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op, DAG.getConstant(0xFF00, dl, VT)); diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 049efe8..c33bf8b 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -44,7 +44,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { return false; return StringSwitch<bool>(FPAttr.getValueAsString()) - .Cases("all", "non-leaf", "reserved", true) + .Cases({"all", "non-leaf", "reserved"}, true) .Case("none", false); } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index d4901d9..8a8f111 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -5844,11 +5844,11 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, bool MasmParser::lookUpType(StringRef Name, AsmTypeInfo &Info) const { unsigned Size = StringSwitch<unsigned>(Name) - .CasesLower("byte", "db", "sbyte", 1) - .CasesLower("word", "dw", "sword", 2) - .CasesLower("dword", "dd", "sdword", 4) - .CasesLower("fword", "df", 6) - .CasesLower("qword", "dq", "sqword", 8) + .CasesLower({"byte", "db", "sbyte"}, 1) + .CasesLower({"word", "dw", "sword"}, 2) + .CasesLower({"dword", "dd", "sdword"}, 4) + .CasesLower({"fword", "df"}, 6) + .CasesLower({"qword", "dq", "sqword"}, 8) .CaseLower("real4", 4) .CaseLower("real8", 8) .CaseLower("real10", 10) diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index c27f627..3a97185 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -92,8 +92,10 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { for (const auto &G : reverse(Globs)) { StringRef Prefix = G.Pattern.prefix(); + StringRef Suffix = G.Pattern.suffix(); - auto &V = PrefixToGlob.emplace(Prefix).first->second; + auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; + auto &V = SToGlob.emplace(reverse(Suffix)).first->second; V.emplace_back(&G); } } @@ -101,16 +103,18 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { void SpecialCaseList::GlobMatcher::match( StringRef Query, llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { - if (!PrefixToGlob.empty()) { - for (const auto &[_, V] : PrefixToGlob.find_prefixes(Query)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); - // As soon as we find a match in the vector, we can break for this - // vector, since the globs are already sorted by priority within the - // prefix group. However, we continue searching other prefix groups in - // the map, as they may contain a better match overall. 
- break; + if (!PrefixSuffixToGlob.empty()) { + for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { + for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { + for (const auto *G : V) { + if (G->Pattern.match(Query)) { + Cb(G->Name, G->LineNo); + // As soon as we find a match in the vector, we can break for this + // vector, since the globs are already sorted by priority within the + // prefix group. However, we continue searching other prefix groups + // in the map, as they may contain a better match overall. + break; + } } } } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index d1832f4..f680a5e 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -651,7 +651,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~AArch64FunctionInfo() = default; + ~AArch64FunctionInfo() override = default; }; template <> struct MappingTraits<AArch64FunctionInfo> { diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 72a7676..47d76f3 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -154,7 +154,7 @@ public: bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override; - virtual bool isIgnoredCVReg(MCRegister LLVMReg) const override; + bool isIgnoredCVReg(MCRegister LLVMReg) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 1b4b113..6bad4dbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -131,7 +131,7 @@ protected: public: MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() override = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -154,7 +154,7 @@ protected: public: MetadataStreamerMsgPackV5() = default; - ~MetadataStreamerMsgPackV5() = default; + ~MetadataStreamerMsgPackV5() override = default; }; class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { @@ -163,7 +163,7 @@ protected: public: MetadataStreamerMsgPackV6() = default; - ~MetadataStreamerMsgPackV6() = default; + ~MetadataStreamerMsgPackV6() override = default; void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index c5c9473..0804133 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -26,20 +26,19 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: AMDGPUMIRFormatter() = default; - virtual ~AMDGPUMIRFormatter() = default; + ~AMDGPUMIRFormatter() override = default; /// Implement target specific printing for machine operand immediate value, so /// that we can have more meaningful mnemonic than a 64-bit integer. Passing /// None to OpIdx means the index is unknown. 
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - std::optional<unsigned> OpIdx, - int64_t Imm) const override; + void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, int64_t Imm) const override; /// Implement target specific parsing of immediate mnemonics. The mnemonic is /// a string with a leading dot. - virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const override; + bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; /// Implement target specific parsing of target custom pseudo source value. bool diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index cbc7427..4d0c163 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -32,7 +32,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() = default; + ~AMDGPUInstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; @@ -88,7 +88,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() = default; + ~AMDGPUCustomBehaviour() override = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 54fcd2a..246a3f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -64,7 +64,7 @@ private: ArrayRef<const MCExpr *> Args; AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - ~AMDGPUMCExpr(); + ~AMDGPUMCExpr() override; bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const; bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5fdedda..dc23a21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1640,7 +1640,7 @@ public: unsigned *PredCost = nullptr) const override; InstructionUniformity - getInstructionUniformity(const MachineInstr &MI) const override final; + getInstructionUniformity(const MachineInstr &MI) const final; InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 2c1a13c..019c3b7 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { const llvm::MachineFunction &MF); void mappingImpl(yaml::IO &YamlIO) override; - ~SIMachineFunctionInfo() = default; + ~SIMachineFunctionInfo() override = default; }; template <> struct MappingTraits<SIMachineFunctionInfo> { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 07264d9..a177a42 100644 --- 
a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -640,7 +640,7 @@ public: bool finalizeStore(MachineInstr &MI, bool Atomic) const override; - virtual bool handleCooperativeAtomic(MachineInstr &MI) const override; + bool handleCooperativeAtomic(MachineInstr &MI) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index a9c4b53..72eb3d0 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -312,7 +312,7 @@ struct ARMFunctionInfo final : public yaml::MachineFunctionInfo { ARMFunctionInfo(const llvm::ARMFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~ARMFunctionInfo() = default; + ~ARMFunctionInfo() override = default; }; template <> struct MappingTraits<ARMFunctionInfo> { diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 3a840a3..5548ad1 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -40,7 +40,7 @@ class AVRDisassembler : public MCDisassembler { public: AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~AVRDisassembler() = default; + ~AVRDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 619efb3..03c60e8 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -24,7 +24,7 @@ class AVRELFObjectWriter : public MCELFObjectTargetWriter { public: AVRELFObjectWriter(uint8_t OSABI); - virtual ~AVRELFObjectWriter() = default; + ~AVRELFObjectWriter() override = default; unsigned getRelocType(const MCFixup &, const MCValue &, bool IsPCRel) const override; diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 0cfb283..90ef207 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -32,7 +32,7 @@ public: void emitInstruction(const MachineInstr *MI) override; MCSymbol *getJTPublicSymbol(unsigned JTI); - virtual void emitJumpTableInfo() override; + void emitJumpTableInfo() override; static char ID; diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp index e3c39a1..b12985d 100644 --- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp +++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp @@ -46,7 +46,7 @@ class BPFCheckAndAdjustIR final : public ModulePass { public: static char ID; BPFCheckAndAdjustIR() : ModulePass(ID) {} - virtual void getAnalysisUsage(AnalysisUsage &AU) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; private: void checkIR(Module &M); diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h index f3064c0c..af3542e 100644 --- a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h +++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { class BPFTargetLoweringObjectFileELF : public TargetLoweringObjectFileELF { public: - virtual MCSection * 
+ MCSection * getSectionForJumpTable(const Function &F, const TargetMachine &TM, const MachineJumpTableEntry *JTE) const override; }; diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 4aecaf1..8e791e6 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -635,7 +635,7 @@ public: B.setChangeObserver(*this); } - ~InstManager() { B.stopObservingChanges(); } + ~InstManager() override { B.stopObservingChanges(); } void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); } void erasingInstr(MachineInstr &MI) override {} diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h index 34efa0b..4ccd3cf 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h @@ -33,7 +33,7 @@ public: explicit RISCVLMULInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {} - ~RISCVLMULInstrument() = default; + ~RISCVLMULInstrument() override = default; uint8_t getLMUL() const; }; @@ -45,7 +45,7 @@ public: explicit RISCVSEWInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {} - ~RISCVSEWInstrument() = default; + ~RISCVSEWInstrument() override = default; uint8_t getSEW() const; }; diff --git a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h index b69904d..b2ce57a 100644 --- a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h +++ b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h @@ -37,7 +37,7 @@ private: RISCVCPKind Kind; public: - ~RISCVConstantPoolValue() = default; + ~RISCVConstantPoolValue() override = default; static RISCVConstantPoolValue *Create(const GlobalValue *GV); static RISCVConstantPoolValue *Create(LLVMContext &C, StringRef S); diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 4fa93f1..f9be80f 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -31,7 +31,7 @@ struct RISCVMachineFunctionInfo final : public yaml::MachineFunctionInfo { RISCVMachineFunctionInfo(const llvm::RISCVMachineFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~RISCVMachineFunctionInfo() = default; + ~RISCVMachineFunctionInfo() override = default; }; template <> struct MappingTraits<RISCVMachineFunctionInfo> { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 28a1690..6e444c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -347,7 +347,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { public: SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(F); GR = ST.getSPIRVGlobalRegistry(); DeadInstructions.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 60d39c9..aba9cf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -234,7 +234,7 @@ public: } #endif - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { LoopInfo &LI = 
getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *TopLevelRegion = getAnalysis<SPIRVConvergenceRegionAnalysisWrapperPass>() diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index e621bcd44..b1a8d1a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -28,7 +28,7 @@ public: SPIRVStripConvergentIntrinsics() : FunctionPass(ID) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { DenseSet<Instruction *> ToRemove; // Is the instruction is a convergent intrinsic, add it to kill-list and diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 5b149f8..ea634fb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -1113,7 +1113,7 @@ public: SPIRVStructurizer() : FunctionPass(ID) {} - virtual bool runOnFunction(Function &F) override { + bool runOnFunction(Function &F) override { bool Modified = false; // In LLVM, Switches are allowed to have several cases branching to the same diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index e585e5a..b4dadaa 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -36,7 +36,7 @@ class SparcDisassembler : public MCDisassembler { public: SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~SparcDisassembler() = default; + ~SparcDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h index 1ff6cc8..ba325b5 100644 --- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -111,7 +111,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { public: SystemZPostRASchedStrategy(const MachineSchedContext *C); - virtual ~SystemZPostRASchedStrategy(); + ~SystemZPostRASchedStrategy() override; /// Called for a region before scheduling. void initPolicy(MachineBasicBlock::iterator Begin, diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h index b1de145..bea8b9f 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -101,7 +101,7 @@ public: int getStackPointerBias() final { return 2048; } /// Destroys the object. Bogus destructor overriding base class destructor - ~SystemZXPLINK64Registers() = default; + ~SystemZXPLINK64Registers() override = default; }; /// ELF calling convention specific use registers @@ -124,7 +124,7 @@ public: int getStackPointerBias() final { return 0; } /// Destroys the object. 
Bogus destructor overriding base class destructor - ~SystemZELFRegisters() = default; + ~SystemZELFRegisters() override = default; }; struct SystemZRegisterInfo : public SystemZGenRegisterInfo { diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp index aad826b..465e074 100644 --- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp +++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp @@ -36,7 +36,7 @@ class VEDisassembler : public MCDisassembler { public: VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~VEDisassembler() = default; + ~VEDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 45bbf12..9175b27 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -102,7 +102,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { WebAssemblyOperand(SMLoc Start, SMLoc End, CaLOp C) : Kind(CatchList), StartLoc(Start), EndLoc(End), CaL(C) {} - ~WebAssemblyOperand() { + ~WebAssemblyOperand() override { if (isBrList()) BrL.~BrLOp(); if (isCatchList()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 40ae4ae..ff4d6469 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -192,7 +192,7 @@ struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo { const llvm::WebAssemblyFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~WebAssemblyFunctionInfo() = default; + ~WebAssemblyFunctionInfo() override = default; }; template <> struct MappingTraits<WebAssemblyFunctionInfo> { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index ac251fd..127ee67 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3533,10 +3533,10 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, while (isLockRepeatNtPrefix(Name.lower())) { unsigned Prefix = StringSwitch<unsigned>(Name) - .Cases("lock", "lock", X86::IP_HAS_LOCK) - .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT) - .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE) - .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK) + .Case("lock", X86::IP_HAS_LOCK) + .Cases({"rep", "repe", "repz"}, X86::IP_HAS_REPEAT) + .Cases({"repne", "repnz"}, X86::IP_HAS_REPEAT_NE) + .Case("notrack", X86::IP_HAS_NOTRACK) .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible) Flags |= Prefix; if (getLexer().is(AsmToken::EndOfStatement)) { diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index d6197f3..05a1c22 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -37,7 +37,7 @@ public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~X86InstrPostProcess() = default; + ~X86InstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; diff --git 
a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 1c06dc4..af5a698 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -491,7 +491,7 @@ namespace X86_MC { class X86MCInstrAnalysis : public MCInstrAnalysis { X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete; X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete; - virtual ~X86MCInstrAnalysis() = default; + ~X86MCInstrAnalysis() override = default; public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 5f974e5..1bda505 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -43,7 +43,7 @@ struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo { X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~X86MachineFunctionInfo() = default; + ~X86MachineFunctionInfo() override = default; }; template <> struct MappingTraits<X86MachineFunctionInfo> { diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 077d29f..3b59ebb 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -272,6 +272,9 @@ AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA, } if (RangePtr && !RangePtr->offsetOrSizeAreUnknown()) { + int64_t StorageSize = DL.getTypeStoreSize(&Ty); + if (StorageSize != RangePtr->Size) + return nullptr; APInt Offset = APInt(64, RangePtr->Offset); return ConstantFoldLoadFromConst(Initializer, &Ty, Offset, DL); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir index 423ce82..d5947c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir @@ -90,9 +90,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -328,3 +326,30 @@ body: | $vgpr0_vgpr1 = COPY %4 ... 
+ +--- +name: shufflevector_cache_element +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: shufflevector_cache_element + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<8 x s32>), [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %2, %2, shufflemask(0, 0, 0, 0, 3, 3, 3, 3) + %4:_(s32) = G_EXTRACT_VECTOR_ELT %3(<8 x s32>), %1(s32) + $vgpr0 = COPY %4 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir index 10f425c..dff8111 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -237,16 +237,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_0_0 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -257,10 +254,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -285,12 +279,10 @@ body: | ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - 
; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_1_1 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -301,10 +293,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -329,16 +318,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_2 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -349,10 +335,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -641,12 +624,10 @@ body: | ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_3_3 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -657,10 +638,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -733,16 +711,13 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_2_0 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -753,10 +728,7 @@ body: | ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 4b6d14e..1d5c858 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ 
b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -53,14 +53,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -223,21 +221,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -377,14 +370,12 @@ define i64 @load_i64_by_i8(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: @@ -434,14 +425,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: @@ -587,14 +576,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: 
orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: @@ -667,14 +654,12 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: @@ -733,14 +718,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 0f6ec8a..7087367 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -117,14 +117,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -237,21 +235,16 @@ define i64 @load_i64_by_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; 
CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -413,14 +406,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: @@ -469,14 +460,12 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: @@ -527,14 +516,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: diff --git a/llvm/test/Transforms/Attributor/range-and-constant-fold.ll b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll new file mode 100644 index 0000000..a8f3309 --- /dev/null +++ b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=attributor %s -o - | FileCheck %s + +@g = internal unnamed_addr addrspace(4) constant [3 x i8] c"12\00", align 16 + +define void @foo(i32 %a, i32 %b, ptr %p.0, ptr %p.1) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], ptr nofree nonnull writeonly captures(none) dereferenceable(1) [[P_0:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(8) [[P_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[L1:.*]], label %[[L2:.*]] +; CHECK: [[L1]]: +; CHECK-NEXT: br label %[[L3:.*]] +; CHECK: [[L2]]: +; CHECK-NEXT: br label %[[L3]] +; CHECK: [[L3]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ @g, %[[L1]] ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %[[L2]] ] +; CHECK-NEXT: [[LOAD_SMALL:%.*]] = load i8, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i8 [[LOAD_SMALL]], ptr [[P_0]], align 1 +; CHECK-NEXT: 
[[LOAD_LARGE:%.*]] = load i64, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i64 [[LOAD_LARGE]], ptr [[P_1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp ne i32 %a, %b + br i1 %cmp, label %l1, label %l2 +l1: + br label %l3 +l2: + br label %l3 +l3: + %phi = phi ptr addrspace(4) [ @g, %l1 ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %l2 ] + %load.small = load i8, ptr addrspace(4) %phi + store i8 %load.small, ptr %p.0 + %load.large = load i64, ptr addrspace(4) %phi + store i64 %load.large, ptr %p.1 + ret void +} diff --git a/llvm/tools/llvm-cov/llvm-cov.cpp b/llvm/tools/llvm-cov/llvm-cov.cpp index 5ada5578..52610eff 100644 --- a/llvm/tools/llvm-cov/llvm-cov.cpp +++ b/llvm/tools/llvm-cov/llvm-cov.cpp @@ -71,8 +71,8 @@ int main(int argc, const char **argv) { .Case("gcov", gcovMain) .Case("report", reportMain) .Case("show", showMain) - .Cases("-h", "-help", "--help", helpMain) - .Cases("-version", "--version", versionMain) + .Cases({"-h", "-help", "--help"}, helpMain) + .Cases({"-version", "--version"}, versionMain) .Default(nullptr); if (Func) { diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index 50e9c6e..aceb4f3 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -70,6 +70,16 @@ public: int getValue() const { return Value; } bool operator==(const CtorTester &RHS) const { return Value == RHS.Value; } + + // Return the number of live CtorTester objects, excluding the empty and + // tombstone keys. + static size_t getNumConstructed() { + return std::count_if(Constructed.begin(), Constructed.end(), + [](const CtorTester *Obj) { + int V = Obj->getValue(); + return V != -1 && V != -2; + }); + } }; std::set<CtorTester *> CtorTester::Constructed; @@ -1031,4 +1041,64 @@ TEST(SmallDenseMapCustomTest, InitSize) { } } +TEST(DenseMapCustomTest, KeyDtor) { + // This test relies on CtorTester being non-trivially destructible. + static_assert(!std::is_trivially_destructible_v<CtorTester>, + "CtorTester must not be trivially destructible"); + + // Test that keys are destructed on scope exit. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<CtorTester, int, CtorTesterMapInfo> Map; + Map.try_emplace(CtorTester(0), 1); + Map.try_emplace(CtorTester(1), 2); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + + // Test that keys are destructed on erase and shrink_and_clear. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<CtorTester, int, CtorTesterMapInfo> Map; + Map.try_emplace(CtorTester(0), 1); + Map.try_emplace(CtorTester(1), 2); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + Map.erase(CtorTester(1)); + EXPECT_EQ(1u, CtorTester::getNumConstructed()); + Map.shrink_and_clear(); + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); +} + +TEST(DenseMapCustomTest, ValueDtor) { + // This test relies on CtorTester being non-trivially destructible. + static_assert(!std::is_trivially_destructible_v<CtorTester>, + "CtorTester must not be trivially destructible"); + + // Test that values are destructed on scope exit. + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<int, CtorTester> Map; + Map.try_emplace(0, CtorTester(1)); + Map.try_emplace(1, CtorTester(2)); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + + // Test that values are destructed on erase and shrink_and_clear. 
+ EXPECT_EQ(0u, CtorTester::getNumConstructed()); + { + DenseMap<int, CtorTester> Map; + Map.try_emplace(0, CtorTester(1)); + Map.try_emplace(1, CtorTester(2)); + EXPECT_EQ(2u, CtorTester::getNumConstructed()); + Map.erase(1); + EXPECT_EQ(1u, CtorTester::getNumConstructed()); + Map.shrink_and_clear(); + EXPECT_EQ(0u, CtorTester::getNumConstructed()); + } + EXPECT_EQ(0u, CtorTester::getNumConstructed()); +} + } // namespace diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp index d88a0ff..c94feb5 100644 --- a/llvm/unittests/ADT/StringSwitchTest.cpp +++ b/llvm/unittests/ADT/StringSwitchTest.cpp @@ -157,7 +157,7 @@ TEST(StringSwitchTest, Cases) { auto Translate = [](StringRef S) { return llvm::StringSwitch<OSType>(S) - .Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt", + .Cases({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"}, OSType::Windows) .Cases({"linux", "unix", "*nix", "posix"}, OSType::Linux) .Cases({"macos", "osx"}, OSType::MacOS) @@ -189,7 +189,7 @@ TEST(StringSwitchTest, CasesLower) { auto Translate = [](StringRef S) { return llvm::StringSwitch<OSType>(S) - .CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt", + .CasesLower({StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt"}, OSType::Windows) .CasesLower({"linux", "unix", "*nix", "posix"}, OSType::Linux) .CasesLower({"macos", "osx"}, OSType::MacOS) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index b570f8d..151b065 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -71,6 +71,7 @@ CodeGen/Hexagon/autohvx/interleave.ll CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll CodeGen/Hexagon/loop-idiom/memmove-rt-check.ll +CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll @@ -331,6 +332,10 @@ Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll Instrumentation/MemorySanitizer/AArch64/module-flags-aarch64.ll Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll Instrumentation/MemorySanitizer/AArch64/qshrn.ll +Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll +Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll +Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll +Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll Instrumentation/MemorySanitizer/AArch64/vararg.ll Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll Instrumentation/MemorySanitizer/abs-vector.ll diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 1437c39..37db096 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1111,9 +1111,9 @@ def AMDGPU_ScaledMFMAOp : AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>, Pure]>, Arguments<(ins - I32Attr:$m, - I32Attr:$n, - I32Attr:$k, + ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$m, + ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$n, + ConfinedAttr<I32Attr, [IntIsOneOf<[64, 128]>]>:$k, ScaledMFMAInTypes:$sourceA, ScaledMFMAInTypes:$sourceB, ScaledMFMAOutTypes:$destC, @@ -1126,8 +1126,8 @@ def AMDGPU_ScaledMFMAOp : let summary = "MLIR wrapper for CDNA scaled mfma instructions"; let description = [{ The `amdgpu.scaled_mfma` op is an MLIR wrapper around intrinsics - for various scaled versions of `mfma` instructions in the CDNA architecture, which perform - 
multiple outer products in order to allow fast matrix multiplication. + for various scaled versions of `mfma` instructions in the CDNA architecture, which + perform multiple outer products in order to allow fast matrix multiplication. The wrapper will select an appropriate `mfma` instruction, if one is available, based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the @@ -1142,15 +1142,23 @@ def AMDGPU_ScaledMFMAOp : This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences: - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and - fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile - size. + fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as + their tile size. - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` - are omitted from this wrapper. - - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for - double-precision operations on gfx94x and so are not included here. + are omitted from this wrapper. + - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported + for double-precision operations on gfx94x and so are not included here. + + Example: + ```mlir + %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 + : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + ``` }]; let assemblyFormat = [{ - `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC + custom<MNKDimensionList>($m, $n, $k) ` ` + `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` + `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC attr-dict `:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC) }]; diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir index 6de55d5..c746d76 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir @@ -55,10 +55,10 @@ func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>, // CHECK-LABEL: func @scaled_mfma_to_rocdl( // CHECK-SAME: %[[ARG0:.*]]: vector<16xf32>, %[[ARG1:.*]]: vector<4xf32>, %[[ARG2:.*]]: vector<32xf8E4M3FN>, %[[ARG3:.*]]: vector<32xf8E5M2>, %[[ARG4:.*]]: vector<32xf6E2M3FN>, %[[ARG5:.*]]: vector<32xf6E3M2FN>, %[[ARG6:.*]]: vector<32xf4E2M1FN>, %[[ARG7:.*]]: vector<4xf8E8M0FNU>, %[[ARG8:.*]]: f8E8M0FNU func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>, - %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>, - %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>, - %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>, - %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) { + %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>, + %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>, + %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>, + %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) { // CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32 @@ -66,39 +66,39 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>, // CHECK: %[[z0:.+]] = llvm.zext {{.*}} : i8 to i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - 
amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: llvm.mlir.constant(3 : i32) : i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : 
i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32> // CHECK: llvm.bitcast // CHECK: llvm.mlir.constant(4 : i32) : i32 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32> + amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32> // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> - amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32> + amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32> func.return } diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir index 52d3275..fee0c00 100644 --- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir +++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir @@ -165,10 +165,10 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds: // CHECK-LABEL: func @scaled_mfma // CHECK: %[[SCALE_1:.*]] = vector.extract_strided_slice %0 {offsets = [0], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> // CHECK: %[[SCALE_2:.*]] = vector.extract_strided_slice %2 {offsets = [4], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma(%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}} // CHECK: %[[SCALE_3:.*]] = vector.extract_strided_slice %5 {offsets = [8], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> // CHECK: %[[SCALE_4:.*]] = vector.extract_strided_slice %7 {offsets = [12], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma(%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}} func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2x1x8x1xf8E8M0FNU>, %scalesB: vector<2x1x8x1xf8E8M0FNU>) -> 
(vector<4xf32>, vector<4xf32>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -176,12 +176,12 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleB = vector.extract %scalesB[0, 0, 6, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> %scaleC = vector.extract %scalesA[1, 0, 1, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sC = vector.insert %scaleC, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleD = vector.extract %scalesB[1, 0, 4, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU> %sD = vector.insert %scaleD, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_1 = amdgpu.scaled_mfma(%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_1 = amdgpu.scaled_mfma 16x16x128 (%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_0, %res_1 : vector<4xf32>, vector<4xf32> } @@ -192,7 +192,7 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU> // CHECK: vector.extract {{.*}} : f8E8M0FNU from vector<2xf8E8M0FNU> // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU> -// CHECK: amdgpu.scaled_mfma({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}} +// CHECK: amdgpu.scaled_mfma 16x16x128 ({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}} func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2xf8E8M0FNU>, %scalesB: vector<2xf8E8M0FNU>) -> vector<4xf32> { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -200,17 +200,17 @@ func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4 %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %scaleB = vector.extract %scalesB[1] : f8E8M0FNU from vector<2xf8E8M0FNU> %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_0 : vector<4xf32> } // ----- // CHECK-LABEL: func @scaled_mfma_ugly_shapes -// CHECK: amdgpu.scaled_mfma(%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[1] * %{{.*}}) * 
(%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> -// CHECK: amdgpu.scaled_mfma(%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> +// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<5x5xf8E8M0FNU>, %scalesB: vector<7x23xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU> @@ -237,10 +237,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4 %sB_6_21 = vector.insert %scaleB_6_21, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %sB_6_20 = vector.insert %scaleB_6_20, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> %sB_6_19 = vector.insert %scaleB_6_19, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU> - - %res_4 = amdgpu.scaled_mfma(%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_5 = amdgpu.scaled_mfma(%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_6 = amdgpu.scaled_mfma(%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> - %res_7 = amdgpu.scaled_mfma(%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + + %res_4 = amdgpu.scaled_mfma 16x16x128 (%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_5 = amdgpu.scaled_mfma 16x16x128 (%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> + %res_6 = amdgpu.scaled_mfma 16x16x128 (%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, 
vector<32xf4E2M1FN>, vector<4xf32> + %res_7 = amdgpu.scaled_mfma 16x16x128 (%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32> return %res_4, %res_5, %res_6, %res_7 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> } diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index b27dfdf..5784764 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -314,3 +314,27 @@ func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> func.return } + +// ----- + +func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} + +// ----- + +func.func @scaled_mfma_invalid_n(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + %0 = amdgpu.scaled_mfma 32x8x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} + +// ----- + +func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { + // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {64, 128}}} + %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> + func.return %0 : vector<16xf32> +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index d0bf0d7..a330967 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -609,8 +609,8 @@ func.func @permlane32_swap(%arg0 : f32) -> f32 { // CHECK-LABEL: func @scaled_mfma func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { - // CHECK: amdgpu.scaled_mfma - %0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> + // CHECK: amdgpu.scaled_mfma 32x32x64 + %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> func.return %0 : vector<16xf32> } |
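
Note on the StringSwitch changes above (X86AsmParser.cpp, llvm-cov.cpp, StringSwitchTest.cpp): the call sites move from the variadic Cases("a", "b", Value) spelling to the overload that takes a braced list of case strings followed by a single value. A minimal usage sketch, assuming only the initializer-list overloads exercised in those hunks; the classifyPrefix helper below is hypothetical and for illustration only:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  // Map a prefix mnemonic to a flag value; several spellings share one result.
  static unsigned classifyPrefix(llvm::StringRef Name) {
    return llvm::StringSwitch<unsigned>(Name)
        .Case("lock", 1)
        .Cases({"rep", "repe", "repz"}, 2)   // braced list of keys, one value
        .Cases({"repne", "repnz"}, 3)
        .Default(0);
  }

The braced-list form keeps the case strings visually separate from the value argument, which the old variadic form left implicit as "last argument wins".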
