141 files changed, 12735 insertions, 2678 deletions
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 082f1ce..8960b19 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -190,6 +190,9 @@ class BinaryContext { /// Unique build ID if available for the binary. std::optional<std::string> FileBuildID; + /// GNU property note indicating AArch64 BTI. + bool UsesBTI{false}; + /// Set of all sections. struct CompareSections { bool operator()(const BinarySection *A, const BinarySection *B) const { @@ -384,6 +387,9 @@ public: } void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); } + bool usesBTI() const { return UsesBTI; } + void setUsesBTI(bool Value) { UsesBTI = Value; } + bool hasSymbolsWithFileName() const { return HasSymbolsWithFileName; } void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = Value; } diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h index 8bdc48b..2c1bf18 100644 --- a/bolt/include/bolt/Passes/SplitFunctions.h +++ b/bolt/include/bolt/Passes/SplitFunctions.h @@ -18,25 +18,6 @@ namespace llvm { namespace bolt { -/// Strategy used to partition blocks into fragments. -enum SplitFunctionsStrategy : char { - /// Split each function into a hot and cold fragment using profiling - /// information. - Profile2 = 0, - /// Split each function into a hot, warm, and cold fragment using - /// profiling information. - CDSplit, - /// Split each function into a hot and cold fragment at a randomly chosen - /// split point (ignoring any available profiling information). - Random2, - /// Split each function into N fragments at a randomly chosen split points - /// (ignoring any available profiling information). - RandomN, - /// Split all basic blocks of each function into fragments such that each - /// fragment contains exactly a single basic block. - All -}; - class SplitStrategy { public: using BlockIt = BinaryFunction::BasicBlockOrderType::iterator; diff --git a/bolt/include/bolt/Rewrite/MetadataRewriters.h b/bolt/include/bolt/Rewrite/MetadataRewriters.h index b71bd6c..2c09c879 100644 --- a/bolt/include/bolt/Rewrite/MetadataRewriters.h +++ b/bolt/include/bolt/Rewrite/MetadataRewriters.h @@ -27,6 +27,8 @@ std::unique_ptr<MetadataRewriter> createPseudoProbeRewriter(BinaryContext &); std::unique_ptr<MetadataRewriter> createSDTRewriter(BinaryContext &); +std::unique_ptr<MetadataRewriter> createGNUPropertyRewriter(BinaryContext &); + } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index 859d6f3..0964c2c 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -29,6 +29,25 @@ enum HeatmapModeKind { HM_Optional // perf2bolt --heatmap }; +/// Strategy used to partition blocks into fragments. +enum SplitFunctionsStrategy : char { + /// Split each function into a hot and cold fragment using profiling + /// information. + Profile2 = 0, + /// Split each function into a hot, warm, and cold fragment using + /// profiling information. + CDSplit, + /// Split each function into a hot and cold fragment at a randomly chosen + /// split point (ignoring any available profiling information). + Random2, + /// Split each function into N fragments at a randomly chosen split points + /// (ignoring any available profiling information). 
+ RandomN, + /// Split all basic blocks of each function into fragments such that each + /// fragment contains exactly a single basic block. + All +}; + using HeatmapBlockSizes = std::vector<unsigned>; struct HeatmapBlockSpecParser : public llvm::cl::parser<HeatmapBlockSizes> { explicit HeatmapBlockSpecParser(llvm::cl::Option &O) @@ -78,6 +97,7 @@ extern llvm::cl::opt<std::string> OutputFilename; extern llvm::cl::opt<std::string> PerfData; extern llvm::cl::opt<bool> PrintCacheMetrics; extern llvm::cl::opt<bool> PrintSections; +extern llvm::cl::opt<SplitFunctionsStrategy> SplitStrategy; // The format to use with -o in aggregation mode (perf2bolt) enum ProfileFormatKind { PF_Fdata, PF_YAML }; diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 4dade16..03c1ea9 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -895,6 +895,10 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) { Error LongJmpPass::runOnFunctions(BinaryContext &BC) { + assert((opts::CompactCodeModel || + opts::SplitStrategy != opts::SplitFunctionsStrategy::CDSplit) && + "LongJmp cannot work with functions split in more than two fragments"); + if (opts::CompactCodeModel) { BC.outs() << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n"; diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index b21401e..eab669b 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -86,29 +86,6 @@ static cl::opt<unsigned> SplitThreshold( "increase after splitting."), cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt<SplitFunctionsStrategy> SplitStrategy( - "split-strategy", cl::init(SplitFunctionsStrategy::Profile2), - cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2", - "split each function into a hot and cold fragment " - "using profiling information")), - cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit", - "split each function into a hot, warm, and cold " - "fragment using profiling information")), - cl::values(clEnumValN( - SplitFunctionsStrategy::Random2, "random2", - "split each function into a hot and cold fragment at a randomly chosen " - "split point (ignoring any available profiling information)")), - cl::values(clEnumValN( - SplitFunctionsStrategy::RandomN, "randomN", - "split each function into N fragments at a randomly chosen split " - "points (ignoring any available profiling information)")), - cl::values(clEnumValN( - SplitFunctionsStrategy::All, "all", - "split all basic blocks of each function into fragments such that each " - "fragment contains exactly a single basic block")), - cl::desc("strategy used to partition blocks into fragments"), - cl::cat(BoltOptCategory)); - static cl::opt<double> CallScale( "call-scale", cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"), @@ -724,14 +701,14 @@ Error SplitFunctions::runOnFunctions(BinaryContext &BC) { // If split strategy is not CDSplit, then a second run of the pass is not // needed after function reordering. 
if (BC.HasFinalizedFunctionOrder && - opts::SplitStrategy != SplitFunctionsStrategy::CDSplit) + opts::SplitStrategy != opts::SplitFunctionsStrategy::CDSplit) return Error::success(); std::unique_ptr<SplitStrategy> Strategy; bool ForceSequential = false; switch (opts::SplitStrategy) { - case SplitFunctionsStrategy::CDSplit: + case opts::SplitFunctionsStrategy::CDSplit: // CDSplit runs two splitting passes: hot-cold splitting (SplitPrfoile2) // before function reordering and hot-warm-cold splitting // (SplitCacheDirected) after function reordering. @@ -742,21 +719,21 @@ Error SplitFunctions::runOnFunctions(BinaryContext &BC) { opts::AggressiveSplitting = true; BC.HasWarmSection = true; break; - case SplitFunctionsStrategy::Profile2: + case opts::SplitFunctionsStrategy::Profile2: Strategy = std::make_unique<SplitProfile2>(); break; - case SplitFunctionsStrategy::Random2: + case opts::SplitFunctionsStrategy::Random2: Strategy = std::make_unique<SplitRandom2>(); // If we split functions randomly, we need to ensure that across runs with // the same input, we generate random numbers for each function in the same // order. ForceSequential = true; break; - case SplitFunctionsStrategy::RandomN: + case opts::SplitFunctionsStrategy::RandomN: Strategy = std::make_unique<SplitRandomN>(); ForceSequential = true; break; - case SplitFunctionsStrategy::All: + case opts::SplitFunctionsStrategy::All: Strategy = std::make_unique<SplitAll>(); break; } diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt index 7750360..5b15edc 100644 --- a/bolt/lib/Rewrite/CMakeLists.txt +++ b/bolt/lib/Rewrite/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMBOLTRewrite PseudoProbeRewriter.cpp RewriteInstance.cpp SDTRewriter.cpp + GNUPropertyRewriter.cpp NO_EXPORT DISABLE_LLVM_LINK_LLVM_DYLIB diff --git a/bolt/lib/Rewrite/GNUPropertyRewriter.cpp b/bolt/lib/Rewrite/GNUPropertyRewriter.cpp new file mode 100644 index 0000000..f61c08e --- /dev/null +++ b/bolt/lib/Rewrite/GNUPropertyRewriter.cpp @@ -0,0 +1,147 @@ +//===- bolt/Rewrite/GNUPropertyRewriter.cpp -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Read the .note.gnu.property section. 
+// +//===----------------------------------------------------------------------===// + +#include "bolt/Rewrite/MetadataRewriter.h" +#include "bolt/Rewrite/MetadataRewriters.h" +#include "llvm/Support/Errc.h" + +using namespace llvm; +using namespace bolt; + +namespace { + +class GNUPropertyRewriter final : public MetadataRewriter { + + Expected<uint32_t> decodeGNUPropertyNote(StringRef Desc); + +public: + GNUPropertyRewriter(StringRef Name, BinaryContext &BC) + : MetadataRewriter(Name, BC) {} + + Error sectionInitializer() override; +}; + +Error GNUPropertyRewriter::sectionInitializer() { + + ErrorOr<BinarySection &> Sec = + BC.getUniqueSectionByName(".note.gnu.property"); + if (!Sec) + return Error::success(); + + // Accumulate feature bits + uint32_t FeaturesAcc = 0; + + StringRef Buf = Sec->getContents(); + DataExtractor DE(Buf, BC.AsmInfo->isLittleEndian(), + BC.AsmInfo->getCodePointerSize()); + DataExtractor::Cursor Cursor(0); + while (Cursor && !DE.eof(Cursor)) { + const uint32_t NameSz = DE.getU32(Cursor); + const uint32_t DescSz = DE.getU32(Cursor); + const uint32_t Type = DE.getU32(Cursor); + + StringRef Name = + NameSz ? Buf.slice(Cursor.tell(), Cursor.tell() + NameSz) : "<empty>"; + Cursor.seek(alignTo(Cursor.tell() + NameSz, 4)); + + const uint64_t DescOffset = Cursor.tell(); + StringRef Desc = + DescSz ? Buf.slice(DescOffset, DescOffset + DescSz) : "<empty>"; + Cursor.seek(alignTo(DescOffset + DescSz, 4)); + if (!Cursor) + return createStringError( + errc::executable_format_error, + "out of bounds while reading .note.gnu.property section: %s", + toString(Cursor.takeError()).c_str()); + + if (Type == ELF::NT_GNU_PROPERTY_TYPE_0 && Name.starts_with("GNU") && + DescSz) { + auto Features = decodeGNUPropertyNote(Desc); + if (!Features) + return Features.takeError(); + FeaturesAcc |= *Features; + } + } + + if (BC.isAArch64()) { + BC.setUsesBTI(FeaturesAcc & llvm::ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI); + if (BC.usesBTI()) + BC.outs() << "BOLT-WARNING: binary is using BTI. Optimized binary may be " + "corrupted\n"; + } + + return Error::success(); +} + +/// \p Desc contains an array of property descriptors. Each member has the +/// following structure: +/// typedef struct { +/// Elf_Word pr_type; +/// Elf_Word pr_datasz; +/// unsigned char pr_data[PR_DATASZ]; +/// unsigned char pr_padding[PR_PADDING]; +/// } Elf_Prop; +/// +/// As there is no guarantee that the features are encoded in which element of +/// the array, we have to read all, and OR together the result. 
+Expected<uint32_t> GNUPropertyRewriter::decodeGNUPropertyNote(StringRef Desc) { + DataExtractor DE(Desc, BC.AsmInfo->isLittleEndian(), + BC.AsmInfo->getCodePointerSize()); + DataExtractor::Cursor Cursor(0); + const uint32_t Align = DE.getAddressSize(); + + std::optional<uint32_t> Features = 0; + while (Cursor && !DE.eof(Cursor)) { + const uint32_t PrType = DE.getU32(Cursor); + const uint32_t PrDataSz = DE.getU32(Cursor); + + const uint64_t PrDataStart = Cursor.tell(); + const uint64_t PrDataEnd = PrDataStart + PrDataSz; + Cursor.seek(PrDataEnd); + if (!Cursor) + return createStringError( + errc::executable_format_error, + "out of bounds while reading .note.gnu.property section: %s", + toString(Cursor.takeError()).c_str()); + + if (PrType == llvm::ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND) { + if (PrDataSz != 4) { + return createStringError( + errc::executable_format_error, + "Property descriptor size has to be 4 bytes on AArch64\n"); + } + DataExtractor::Cursor Tmp(PrDataStart); + // PrDataSz = 4 -> PrData is uint32_t + const uint32_t FeaturesItem = DE.getU32(Tmp); + if (!Tmp) + return createStringError( + errc::executable_format_error, + "failed to read property from .note.gnu.property section: %s", + toString(Tmp.takeError()).c_str()); + Features = Features ? (*Features | FeaturesItem) : FeaturesItem; + } + + Cursor.seek(alignTo(PrDataEnd, Align)); + if (!Cursor) + return createStringError(errc::executable_format_error, + "out of bounds while reading property array in " + ".note.gnu.property section: %s", + toString(Cursor.takeError()).c_str()); + } + return Features.value_or(0u); +} +} // namespace + +std::unique_ptr<MetadataRewriter> +llvm::bolt::createGNUPropertyRewriter(BinaryContext &BC) { + return std::make_unique<GNUPropertyRewriter>("gnu-property-rewriter", BC); +} diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index c13a9f0..bfd03e0 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2115,6 +2115,13 @@ void RewriteInstance::adjustCommandLineOptions() { opts::SplitEH = false; } + if (BC->isAArch64() && !opts::CompactCodeModel && + opts::SplitStrategy == opts::SplitFunctionsStrategy::CDSplit) { + BC->errs() << "BOLT-ERROR: CDSplit is not supported with LongJmp. 
Try with " + "'--compact-code-model'\n"; + exit(1); + } + if (opts::StrictMode && !BC->HasRelocations) { BC->errs() << "BOLT-WARNING: disabling strict mode (-strict) in non-relocation " @@ -3331,6 +3338,8 @@ void RewriteInstance::initializeMetadataManager() { MetadataManager.registerRewriter(createPseudoProbeRewriter(*BC)); MetadataManager.registerRewriter(createSDTRewriter(*BC)); + + MetadataManager.registerRewriter(createGNUPropertyRewriter(*BC)); } void RewriteInstance::processSectionMetadata() { diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index 5635da4..095612a 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -104,6 +104,29 @@ ExecutionCountThreshold("execution-count-threshold", cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt<SplitFunctionsStrategy> SplitStrategy( + "split-strategy", cl::init(SplitFunctionsStrategy::Profile2), + cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2", + "split each function into a hot and cold fragment " + "using profiling information")), + cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit", + "split each function into a hot, warm, and cold " + "fragment using profiling information")), + cl::values(clEnumValN( + SplitFunctionsStrategy::Random2, "random2", + "split each function into a hot and cold fragment at a randomly chosen " + "split point (ignoring any available profiling information)")), + cl::values(clEnumValN( + SplitFunctionsStrategy::RandomN, "randomN", + "split each function into N fragments at a randomly chosen split " + "points (ignoring any available profiling information)")), + cl::values(clEnumValN( + SplitFunctionsStrategy::All, "all", + "split all basic blocks of each function into fragments such that each " + "fragment contains exactly a single basic block")), + cl::desc("strategy used to partition blocks into fragments"), + cl::cat(BoltOptCategory)); + bool HeatmapBlockSpecParser::parse(cl::Option &O, StringRef ArgName, StringRef Arg, HeatmapBlockSizes &Val) { // Parses a human-readable suffix into a shift amount or nullopt on error. diff --git a/bolt/test/AArch64/Inputs/property-note-bti.yaml b/bolt/test/AArch64/Inputs/property-note-bti.yaml new file mode 100644 index 0000000..541ae92 --- /dev/null +++ b/bolt/test/AArch64/Inputs/property-note-bti.yaml @@ -0,0 +1,50 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 + Entry: 0x400510 +ProgramHeaders: + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x400338 + Align: 0x8 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x0 + Align: 0x10000 + FileSize: 0xf8 + MemSize: 0xf8 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x2a0000 + AddressAlign: 0x4 + Content: 400580d2c0035fd6 + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x400338 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 000000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Type: SectionHeaderTable + Sections: + - Name: .note.gnu.property + - Name: .symtab + - Name: .strtab + - Name: .shstrtab + - Name: .text +Symbols: + - Name: .note.gnu.property + Type: STT_SECTION + Section: .note.gnu.property + Value: 0x400338 +... 
diff --git a/bolt/test/AArch64/Inputs/property-note-nobti.yaml b/bolt/test/AArch64/Inputs/property-note-nobti.yaml new file mode 100644 index 0000000..a041a58 --- /dev/null +++ b/bolt/test/AArch64/Inputs/property-note-nobti.yaml @@ -0,0 +1,50 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 + Entry: 0x400510 +ProgramHeaders: + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x400338 + Align: 0x8 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x0 + Align: 0x10000 + FileSize: 0xf8 + MemSize: 0xf8 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x2a0000 + AddressAlign: 0x4 + Content: 400580d2c0035fd6 + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x400338 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 000000C0040000000200000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Type: SectionHeaderTable + Sections: + - Name: .note.gnu.property + - Name: .symtab + - Name: .strtab + - Name: .shstrtab + - Name: .text +Symbols: + - Name: .note.gnu.property + Type: STT_SECTION + Section: .note.gnu.property + Value: 0x400338 +... diff --git a/bolt/test/AArch64/bti-note.test b/bolt/test/AArch64/bti-note.test new file mode 100644 index 0000000..1ec9d77 --- /dev/null +++ b/bolt/test/AArch64/bti-note.test @@ -0,0 +1,10 @@ +// This test checks that the GNUPropertyRewriter can decode the BTI feature flag. +// It decodes an executable with BTI, and checks for the warning. + +RUN: yaml2obj %p/Inputs/property-note-bti.yaml &> %t.exe + +RUN: llvm-readelf -n %t.exe | FileCheck %s +CHECK: BTI + +RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s -check-prefix=CHECK-BOLT +CHECK-BOLT: BOLT-WARNING: binary is using BTI. Optimized binary may be corrupted diff --git a/bolt/test/AArch64/no-bti-note.test b/bolt/test/AArch64/no-bti-note.test new file mode 100644 index 0000000..28cce34 --- /dev/null +++ b/bolt/test/AArch64/no-bti-note.test @@ -0,0 +1,10 @@ +// This test checks that the GNUPropertyRewriter can decode the BTI feature flag. +// It decodes an executable without BTI, and checks for the warning. + +RUN: yaml2obj %p/Inputs/property-note-nobti.yaml &> %t.exe + +RUN: llvm-readelf -n %t.exe | FileCheck %s +CHECK-NOT: BTI + +RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s -check-prefix=CHECK-BOLT +CHECK-BOLT-NOT: BOLT-WARNING: binary is using BTI. Optimized binary may be corrupted diff --git a/bolt/test/AArch64/unsupported-passes.test b/bolt/test/AArch64/unsupported-passes.test index 886fc1c..5b12d86 100644 --- a/bolt/test/AArch64/unsupported-passes.test +++ b/bolt/test/AArch64/unsupported-passes.test @@ -3,6 +3,9 @@ // REQUIRES: system-linux,asserts,target=aarch64{{.*}} RUN: %clang %cflags %p/../Inputs/hello.c -o %t -Wl,-q -RUN: not llvm-bolt %t -o %t.bolt --frame-opt=all 2>&1 | FileCheck %s +RUN: not llvm-bolt %t -o %t.bolt --frame-opt=all 2>&1 | FileCheck %s --check-prefix=CHECK-FRAME-OPT -CHECK: BOLT-ERROR: frame-optimizer is supported only on X86 +CHECK-FRAME-OPT: BOLT-ERROR: frame-optimizer is supported only on X86 + +RUN: not llvm-bolt %t -o %t.bolt split-functions --split-strategy=cdsplit 2>&1 | FileCheck %s --check-prefix=CHECK-CDSPLIT +CHECK-CDSPLIT: BOLT-ERROR: CDSplit is not supported with LongJmp. 
Try with '--compact-code-model' diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index bd74227..c677ddfa 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -2859,6 +2859,67 @@ This library is called by the :ref:`Parser library <Parser>` during parsing to do semantic analysis of the input. For valid programs, Sema builds an AST for parsed constructs. + +Concept Satisfaction Checking and Subsumption +--------------------------------------------- + +As per the C++ standard, constraints are `normalized <https://eel.is/c++draft/temp.constr.normal>`_ +and the normal form is used both for subsumption, and constraint checking. +Both depend on a parameter mapping that substitutes lazily. In particular, +we should not substitute in unused arguments. + +Clang follows the order of operations prescribed by the standard. + +Normalization happens prior to satisfaction and subsumption +and is handled by ``NormalizedConstraint``. + +Clang preserves in the normalized form intermediate concept-ids +(``ConceptIdConstraint``) This is used for diagnostics only and no substitution +happens in a ConceptIdConstraint if its expression is satisfied. + +The normal form of the associated constraints of a declaration is cached in +Sema::NormalizationCache such that it is only computed once. + +A ``NormalizedConstraint`` is a recursive data structure, where each node +contains a parameter mapping, represented by the indexes of all parameter +being used. + +Checking satisfaction is done by ``ConstraintSatisfactionChecker``, recursively +walking ``NormalizedConstraint``. At each level, we substitute the outermost +level of the template arguments referenced in the parameter mapping of a +normalized expression (``MultiLevelTemplateArgumentList``). + +For the following example, + +.. code-block:: c++ + + template <typename T> + concept A = __is_same(T, int); + + template <typename U> + concept B = A<U> && __is_same(U, int); + +The normal form of B is + +.. code-block:: c++ + + __is_same(T, int) /*T->U, innermost level*/ + && __is_same(U, int) {U->U} /*T->U, outermost level*/ + +After substitution in the mapping, we substitute in the constraint expression +using that copy of the ``MultiLevelTemplateArgumentList``, and then evaluate it. + +Because this is expensive, it is cached in +``UnsubstitutedConstraintSatisfactionCache``. + +Any error during satisfaction is recorded in ``ConstraintSatisfaction``. +for nested requirements, ``ConstraintSatisfaction`` is stored (including +diagnostics) in the AST, which is something we might want to improve. + +When an atomic constraint is not satified, we try to substitute into any +enclosing concept-id using the same mechanism described above, for +diagnostics purpose, and inject that in the ``ConstraintSatisfaction``. + .. _CodeGen: The CodeGen Library diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 145a83a..d2e5bd2 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -160,6 +160,10 @@ C++23 Feature Support C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ +- Clang now normalizes constraints before checking whether they are satisfied, as mandated by the standard. + As a result, Clang no longer incorrectly diagnoses substitution failures in template arguments only + used in concept-ids, and produces better diagnostics for satisfaction failure. (#GH61811) (#GH135190) + C++17 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -361,7 +365,7 @@ Bug Fixes in This Version first parameter. 
(#GH113323). - Fixed a crash with incompatible pointer to integer conversions in designated initializers involving string literals. (#GH154046) -- Fix crash on CTAD for alias template. (#GH131342) +- Fix crash on CTAD for alias template. (#GH131342), (#GH131408) - Clang now emits a frontend error when a function marked with the `flatten` attribute calls another function that requires target features not enabled in the caller. This prevents a fatal error in the backend. diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h index 72da005..f362f24 100644 --- a/clang/include/clang/AST/ASTConcept.h +++ b/clang/include/clang/AST/ASTConcept.h @@ -28,10 +28,20 @@ namespace clang { class ConceptDecl; class TemplateDecl; +class ConceptReference; class Expr; class NamedDecl; struct PrintingPolicy; +/// Unsatisfied constraint expressions if the template arguments could be +/// substituted into them, or a diagnostic if substitution resulted in +/// an invalid expression. +/// +using ConstraintSubstitutionDiagnostic = std::pair<SourceLocation, StringRef>; +using UnsatisfiedConstraintRecord = + llvm::PointerUnion<const Expr *, const ConceptReference *, + const ConstraintSubstitutionDiagnostic *>; + /// The result of a constraint satisfaction check, containing the necessary /// information to diagnose an unsatisfied constraint. class ConstraintSatisfaction : public llvm::FoldingSetNode { @@ -48,16 +58,13 @@ public: ArrayRef<TemplateArgument> TemplateArgs) : ConstraintOwner(ConstraintOwner), TemplateArgs(TemplateArgs) {} - using SubstitutionDiagnostic = std::pair<SourceLocation, StringRef>; - using Detail = llvm::PointerUnion<Expr *, SubstitutionDiagnostic *>; - bool IsSatisfied = false; bool ContainsErrors = false; /// \brief The substituted constraint expr, if the template arguments could be /// substituted into them, or a diagnostic if substitution resulted in an /// invalid expression. - llvm::SmallVector<Detail, 4> Details; + llvm::SmallVector<UnsatisfiedConstraintRecord, 4> Details; void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &C) { Profile(ID, C, ConstraintOwner, TemplateArgs); @@ -69,19 +76,12 @@ public: bool HasSubstitutionFailure() { for (const auto &Detail : Details) - if (Detail.dyn_cast<SubstitutionDiagnostic *>()) + if (Detail.dyn_cast<const ConstraintSubstitutionDiagnostic *>()) return true; return false; } }; -/// Pairs of unsatisfied atomic constraint expressions along with the -/// substituted constraint expr, if the template arguments could be -/// substituted into them, or a diagnostic if substitution resulted in -/// an invalid expression. -using UnsatisfiedConstraintRecord = - llvm::PointerUnion<Expr *, std::pair<SourceLocation, StringRef> *>; - /// \brief The result of a constraint satisfaction check, containing the /// necessary information to diagnose an unsatisfied constraint. /// @@ -101,6 +101,10 @@ struct ASTConstraintSatisfaction final : return getTrailingObjects() + NumRecords; } + ArrayRef<UnsatisfiedConstraintRecord> records() const { + return {begin(), end()}; + } + ASTConstraintSatisfaction(const ASTContext &C, const ConstraintSatisfaction &Satisfaction); ASTConstraintSatisfaction(const ASTContext &C, @@ -282,6 +286,11 @@ public: } }; +/// Insertion operator for diagnostics. This allows sending ConceptReferences's +/// into a diagnostic with <<. 
+const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const ConceptReference *C); + } // clang #endif // LLVM_CLANG_AST_ASTCONCEPT_H diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 12351e9..78220d4 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -3877,7 +3877,6 @@ typename clang::LazyGenerationalUpdatePtr<Owner, T, Update>::ValueType return new (Ctx) LazyData(Source, Value); return Value; } - template <> struct llvm::DenseMapInfo<llvm::FoldingSetNodeID> { static FoldingSetNodeID getEmptyKey() { return FoldingSetNodeID{}; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f53aafd..265462a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -65,6 +65,7 @@ #include "clang/Sema/Redeclaration.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaBase.h" +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/TypoCorrection.h" #include "clang/Sema/Weak.h" #include "llvm/ADT/APInt.h" @@ -11694,8 +11695,9 @@ public: ExprResult CheckConceptTemplateId(const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, const DeclarationNameInfo &ConceptNameInfo, - NamedDecl *FoundDecl, ConceptDecl *NamedConcept, - const TemplateArgumentListInfo *TemplateArgs); + NamedDecl *FoundDecl, TemplateDecl *NamedConcept, + const TemplateArgumentListInfo *TemplateArgs, + bool DoCheckConstraintSatisfaction = true); void diagnoseMissingTemplateArguments(TemplateName Name, SourceLocation Loc); void diagnoseMissingTemplateArguments(const CXXScopeSpec &SS, @@ -12025,6 +12027,13 @@ public: bool UpdateArgsWithConversions = true, bool *ConstraintsNotSatisfied = nullptr); + bool CheckTemplateArgumentList( + TemplateDecl *Template, TemplateParameterList *Params, + SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs, + const DefaultArguments &DefaultArgs, bool PartialTemplateArgs, + CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions = true, + bool *ConstraintsNotSatisfied = nullptr); + bool CheckTemplateTypeArgument( TemplateTypeParmDecl *Param, TemplateArgumentLoc &Arg, SmallVectorImpl<TemplateArgument> &SugaredConverted, @@ -12783,6 +12792,18 @@ public: void MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced, unsigned Depth, llvm::SmallBitVector &Used); + /// Mark which template parameters are named in a given expression. + /// + /// Unlike MarkUsedTemplateParameters, this excludes parameter that + /// are used but not directly named by an expression - i.e. it excludes + /// any template parameter that denotes the type of a referenced NTTP. + /// + /// \param Used a bit vector whose elements will be set to \c true + /// to indicate when the corresponding template parameter will be + /// deduced. + void MarkUsedTemplateParametersForSubsumptionParameterMapping( + const Expr *E, unsigned Depth, llvm::SmallBitVector &Used); + /// Mark which template parameters can be deduced from a given /// template argument list. /// @@ -12799,6 +12820,9 @@ public: void MarkUsedTemplateParameters(ArrayRef<TemplateArgument> TemplateArgs, unsigned Depth, llvm::SmallBitVector &Used); + void MarkUsedTemplateParameters(ArrayRef<TemplateArgumentLoc> TemplateArgs, + unsigned Depth, llvm::SmallBitVector &Used); + void MarkDeducedTemplateParameters(const FunctionTemplateDecl *FunctionTemplate, llvm::SmallBitVector &Deduced) { @@ -13096,6 +13120,9 @@ public: /// Whether we're substituting into constraints. 
bool InConstraintSubstitution; + /// Whether we're substituting into the parameter mapping of a constraint. + bool InParameterMappingSubstitution; + /// The point of instantiation or synthesis within the source code. SourceLocation PointOfInstantiation; @@ -13146,8 +13173,10 @@ public: CodeSynthesisContext() : Kind(TemplateInstantiation), SavedInNonInstantiationSFINAEContext(false), - InConstraintSubstitution(false), Entity(nullptr), Template(nullptr), - TemplateArgs(nullptr), NumTemplateArgs(0), DeductionInfo(nullptr) {} + InConstraintSubstitution(false), + InParameterMappingSubstitution(false), Entity(nullptr), + Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0), + DeductionInfo(nullptr) {} /// Determines whether this template is an actual instantiation /// that should be counted toward the maximum instantiation depth. @@ -13359,6 +13388,11 @@ public: const MultiLevelTemplateArgumentList &TemplateArgs, TemplateArgumentListInfo &Outputs); + bool SubstTemplateArgumentsInParameterMapping( + ArrayRef<TemplateArgumentLoc> Args, SourceLocation BaseLoc, + const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes); + /// Retrieve the template argument list(s) that should be used to /// instantiate the definition of the given declaration. /// @@ -13820,6 +13854,12 @@ public: CodeSynthesisContexts.back().InConstraintSubstitution; } + bool inParameterMappingSubstitution() const { + return !CodeSynthesisContexts.empty() && + CodeSynthesisContexts.back().InParameterMappingSubstitution && + !inConstraintSubstitution(); + } + using EntityPrinter = llvm::function_ref<void(llvm::raw_ostream &)>; /// \brief create a Requirement::SubstitutionDiagnostic with only a @@ -14704,6 +14744,10 @@ public: SatisfactionStack.swap(NewSS); } + using ConstrainedDeclOrNestedRequirement = + llvm::PointerUnion<const NamedDecl *, + const concepts::NestedRequirement *>; + /// Check whether the given expression is a valid constraint expression. /// A diagnostic is emitted if it is not, false is returned, and /// PossibleNonPrimary will be set to true if the failure might be due to a @@ -14728,44 +14772,12 @@ public: /// \returns true if an error occurred and satisfaction could not be checked, /// false otherwise. bool CheckConstraintSatisfaction( - const NamedDecl *Template, + ConstrainedDeclOrNestedRequirement Entity, ArrayRef<AssociatedConstraint> AssociatedConstraints, const MultiLevelTemplateArgumentList &TemplateArgLists, - SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) { - llvm::SmallVector<Expr *, 4> Converted; - return CheckConstraintSatisfaction(Template, AssociatedConstraints, - Converted, TemplateArgLists, - TemplateIDRange, Satisfaction); - } - - /// \brief Check whether the given list of constraint expressions are - /// satisfied (as if in a 'conjunction') given template arguments. - /// Additionally, takes an empty list of Expressions which is populated with - /// the instantiated versions of the ConstraintExprs. - /// \param Template the template-like entity that triggered the constraints - /// check (either a concept or a constrained entity). - /// \param ConstraintExprs a list of constraint expressions, treated as if - /// they were 'AND'ed together. - /// \param ConvertedConstraints a out parameter that will get populated with - /// the instantiated version of the ConstraintExprs if we successfully checked - /// satisfaction. 
- /// \param TemplateArgList the multi-level list of template arguments to - /// substitute into the constraint expression. This should be relative to the - /// top-level (hence multi-level), since we need to instantiate fully at the - /// time of checking. - /// \param TemplateIDRange The source range of the template id that - /// caused the constraints check. - /// \param Satisfaction if true is returned, will contain details of the - /// satisfaction, with enough information to diagnose an unsatisfied - /// expression. - /// \returns true if an error occurred and satisfaction could not be checked, - /// false otherwise. - bool CheckConstraintSatisfaction( - const NamedDecl *Template, - ArrayRef<AssociatedConstraint> AssociatedConstraints, - llvm::SmallVectorImpl<Expr *> &ConvertedConstraints, - const MultiLevelTemplateArgumentList &TemplateArgList, - SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction); + SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction, + const ConceptReference *TopLevelConceptId = nullptr, + Expr **ConvertedExpr = nullptr); /// \brief Check whether the given non-dependent constraint expression is /// satisfied. Returns false and updates Satisfaction with the satisfaction @@ -14831,16 +14843,17 @@ public: /// \param First whether this is the first time an unsatisfied constraint is /// diagnosed for this error. void DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction &Satisfaction, + SourceLocation Loc = {}, bool First = true); /// \brief Emit diagnostics explaining why a constraint expression was deemed /// unsatisfied. void - DiagnoseUnsatisfiedConstraint(const ASTConstraintSatisfaction &Satisfaction, + DiagnoseUnsatisfiedConstraint(const ConceptSpecializationExpr *ConstraintExpr, bool First = true); const NormalizedConstraint *getNormalizedAssociatedConstraints( - const NamedDecl *ConstrainedDecl, + ConstrainedDeclOrNestedRequirement Entity, ArrayRef<AssociatedConstraint> AssociatedConstraints); /// \brief Check whether the given declaration's associated constraints are @@ -14865,6 +14878,15 @@ public: const NamedDecl *D1, ArrayRef<AssociatedConstraint> AC1, const NamedDecl *D2, ArrayRef<AssociatedConstraint> AC2); + /// Cache the satisfaction of an atomic constraint. + /// The key is based on the unsubstituted expression and the parameter + /// mapping. This lets us not substituting the mapping more than once, + /// which is (very!) expensive. + /// FIXME: this should be private. + llvm::DenseMap<llvm::FoldingSetNodeID, + UnsubstitutedConstraintSatisfactionCacheResult> + UnsubstitutedConstraintSatisfactionCache; + private: /// Caches pairs of template-like decls whose associated constraints were /// checked for subsumption and whether or not the first's constraints did in @@ -14875,8 +14897,11 @@ private: /// constrained declarations). If an error occurred while normalizing the /// associated constraints of the template or concept, nullptr will be cached /// here. - llvm::DenseMap<const NamedDecl *, NormalizedConstraint *> NormalizationCache; + llvm::DenseMap<ConstrainedDeclOrNestedRequirement, NormalizedConstraint *> + NormalizationCache; + /// Cache whether the associated constraint of a declaration + /// is satisfied. 
llvm::ContextualFoldingSet<ConstraintSatisfaction, const ASTContext &> SatisfactionCache; diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h index 648a9c5..51ca1e1 100644 --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -16,130 +16,406 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" +#include "clang/AST/ExprConcepts.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/Ownership.h" #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include <optional> #include <utility> namespace clang { class Sema; +class MultiLevelTemplateArgumentList; -enum { ConstraintAlignment = 8 }; +/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is +/// either an atomic constraint, a conjunction of normalized constraints or a +/// disjunction of normalized constraints. +struct NormalizedConstraint { + + enum class ConstraintKind : unsigned char { + Atomic = 0, + ConceptId, + FoldExpanded, + Compound, + }; + + enum CompoundConstraintKind : unsigned char { + CCK_Conjunction, + CCK_Disjunction + }; + enum class FoldOperatorKind : unsigned char { And, Or }; + + using OccurenceList = llvm::SmallBitVector; + +protected: + using ExprOrConcept = + llvm::PointerUnion<const Expr *, const ConceptReference *>; + + struct AtomicConstraintBits { + // Kind is the first member of all union members, + // as we rely on their initial common sequence. + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + unsigned Placeholder : 1; + unsigned PackSubstitutionIndex : 26; + // Indexes, IndexesForSubsumption, and Args are part of the common initial + // sequences of constraints that do have a mapping. + + // Indexes of the parameters used in a constraint expression. + OccurenceList Indexes; + // Indexes of the parameters named directly in a constraint expression. + // FIXME: we should try to reduce the size of this struct? + OccurenceList IndexesForSubsumption; + + TemplateArgumentLoc *Args; + TemplateParameterList *ParamList; + ExprOrConcept ConstraintExpr; + const NamedDecl *ConstraintDecl; + }; + + struct FoldExpandedConstraintBits { + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + LLVM_PREFERRED_TYPE(FoldOperatorKind) + unsigned FoldOperator : 1; + unsigned Placeholder : 26; + OccurenceList Indexes; + OccurenceList IndexesForSubsumption; + TemplateArgumentLoc *Args; + TemplateParameterList *ParamList; + const Expr *Pattern; + const NamedDecl *ConstraintDecl; + NormalizedConstraint *Constraint; + }; + + struct ConceptIdBits : AtomicConstraintBits { + NormalizedConstraint *Sub; + + // Only used for parameter mapping. 
+ const ConceptSpecializationExpr *CSE; + }; + + struct CompoundConstraintBits { + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + LLVM_PREFERRED_TYPE(CompoundConstraintKind) + unsigned CCK : 1; + NormalizedConstraint *LHS; + NormalizedConstraint *RHS; + }; + + union { + AtomicConstraintBits Atomic; + FoldExpandedConstraintBits FoldExpanded; + ConceptIdBits ConceptId; + CompoundConstraintBits Compound; + }; + + ~NormalizedConstraint() { + if (getKind() != ConstraintKind::Compound) + Atomic.Indexes.llvm::SmallBitVector::~SmallBitVector(); + } + + NormalizedConstraint(const Expr *ConstraintExpr, + const NamedDecl *ConstraintDecl, + UnsignedOrNone PackIndex) + : Atomic{llvm::to_underlying(ConstraintKind::Atomic), + /*Placeholder=*/0, + PackIndex.toInternalRepresentation(), + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, + /*ParamList=*/nullptr, + ConstraintExpr, + ConstraintDecl} {} + + NormalizedConstraint(const Expr *Pattern, FoldOperatorKind OpKind, + NormalizedConstraint *Constraint, + const NamedDecl *ConstraintDecl) + : FoldExpanded{llvm::to_underlying(ConstraintKind::FoldExpanded), + llvm::to_underlying(OpKind), + /*Placeholder=*/0, + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, + /*ParamList=*/nullptr, + Pattern, + ConstraintDecl, + Constraint} {} + + NormalizedConstraint(const ConceptReference *ConceptId, + const NamedDecl *ConstraintDecl, + NormalizedConstraint *SubConstraint, + const ConceptSpecializationExpr *CSE, + UnsignedOrNone PackIndex) + : ConceptId{{llvm::to_underlying(ConstraintKind::ConceptId), + /*Placeholder=*/0, PackIndex.toInternalRepresentation(), + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, /*ParamList=*/nullptr, ConceptId, + ConstraintDecl}, + SubConstraint, + CSE} {} + + NormalizedConstraint(NormalizedConstraint *LHS, CompoundConstraintKind CCK, + NormalizedConstraint *RHS) + : Compound{llvm::to_underlying(ConstraintKind::Compound), + llvm::to_underlying(CCK), LHS, RHS} {} + + bool hasParameterMapping() const { + // compound constraints do not have a mapping + // and Args is not part of their common initial sequence. 
+ return getKind() != ConstraintKind::Compound && Atomic.Args != nullptr; + } + + const OccurenceList &mappingOccurenceList() const { + assert(hasParameterMapping() && "This constraint has no parameter mapping"); + return Atomic.Indexes; + } + + const OccurenceList &mappingOccurenceListForSubsumption() const { + assert(hasParameterMapping() && "This constraint has no parameter mapping"); + return Atomic.IndexesForSubsumption; + } -struct alignas(ConstraintAlignment) AtomicConstraint { - const Expr *ConstraintExpr; - const NamedDecl *ConstraintDecl; - std::optional<ArrayRef<TemplateArgumentLoc>> ParameterMapping; + llvm::MutableArrayRef<TemplateArgumentLoc> getParameterMapping() const { + return {Atomic.Args, Atomic.Indexes.count()}; + } + + TemplateParameterList *getUsedTemplateParamList() const { + return Atomic.ParamList; + } - AtomicConstraint(const Expr *ConstraintExpr, const NamedDecl *ConstraintDecl) - : ConstraintExpr(ConstraintExpr), ConstraintDecl(ConstraintDecl) {}; + void updateParameterMapping(OccurenceList Indexes, + OccurenceList IndexesForSubsumption, + llvm::MutableArrayRef<TemplateArgumentLoc> Args, + TemplateParameterList *ParamList) { + assert(getKind() != ConstraintKind::Compound); + assert(Indexes.count() == Args.size()); + assert(IndexesForSubsumption.size() == Indexes.size()); + assert((Indexes | IndexesForSubsumption) == Indexes); + + Atomic.IndexesForSubsumption = std::move(IndexesForSubsumption); + Atomic.Indexes = std::move(Indexes); + Atomic.Args = Args.data(); + Atomic.ParamList = ParamList; + } bool hasMatchingParameterMapping(ASTContext &C, - const AtomicConstraint &Other) const { - if (!ParameterMapping != !Other.ParameterMapping) + const NormalizedConstraint &Other) const { + assert(getKind() != ConstraintKind::Compound); + + if (hasParameterMapping() != Other.hasParameterMapping()) return false; - if (!ParameterMapping) + if (!hasParameterMapping()) return true; - if (ParameterMapping->size() != Other.ParameterMapping->size()) - return false; - for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) { + llvm::ArrayRef<TemplateArgumentLoc> ParameterMapping = + getParameterMapping(); + llvm::ArrayRef<TemplateArgumentLoc> OtherParameterMapping = + Other.getParameterMapping(); + + const OccurenceList &Indexes = mappingOccurenceListForSubsumption(); + const OccurenceList &OtherIndexes = + Other.mappingOccurenceListForSubsumption(); + + if (ParameterMapping.size() != OtherParameterMapping.size()) + return false; + for (unsigned I = 0, S = ParameterMapping.size(); I < S; ++I) { + if (Indexes[I] != OtherIndexes[I]) + return false; + if (!Indexes[I]) + continue; llvm::FoldingSetNodeID IDA, IDB; - C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) + C.getCanonicalTemplateArgument(ParameterMapping[I].getArgument()) .Profile(IDA, C); - C.getCanonicalTemplateArgument((*Other.ParameterMapping)[I].getArgument()) + C.getCanonicalTemplateArgument(OtherParameterMapping[I].getArgument()) .Profile(IDB, C); if (IDA != IDB) return false; } return true; } -}; -struct alignas(ConstraintAlignment) NormalizedConstraintPair; -struct alignas(ConstraintAlignment) FoldExpandedConstraint; +public: + ConstraintKind getKind() const { + return static_cast<ConstraintKind>(Atomic.Kind); + } -/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is -/// either an atomic constraint, a conjunction of normalized constraints or a -/// disjunction of normalized constraints. 
-struct NormalizedConstraint { + SourceLocation getBeginLoc() const { + switch (getKind()) { + case ConstraintKind::Atomic: + return cast<const Expr *>(Atomic.ConstraintExpr)->getBeginLoc(); + case ConstraintKind::ConceptId: + return cast<const ConceptReference *>(Atomic.ConstraintExpr) + ->getBeginLoc(); + case ConstraintKind::Compound: + return Compound.LHS->getBeginLoc(); + case ConstraintKind::FoldExpanded: + return FoldExpanded.Pattern->getBeginLoc(); + } + } + + SourceLocation getEndLoc() const { + switch (getKind()) { + case ConstraintKind::Atomic: + return cast<const Expr *>(Atomic.ConstraintExpr)->getEndLoc(); + case ConstraintKind::ConceptId: + return cast<const ConceptReference *>(Atomic.ConstraintExpr)->getEndLoc(); + case ConstraintKind::Compound: + return Compound.RHS->getEndLoc(); + case ConstraintKind::FoldExpanded: + return FoldExpanded.Pattern->getEndLoc(); + } + } + + SourceRange getSourceRange() const { return {getBeginLoc(), getEndLoc()}; } + +private: friend class Sema; + static NormalizedConstraint * + fromAssociatedConstraints(Sema &S, const NamedDecl *D, + ArrayRef<AssociatedConstraint> ACs); + static NormalizedConstraint *fromConstraintExpr(Sema &S, const NamedDecl *D, + const Expr *E, + UnsignedOrNone SubstIndex); +}; + +class CompoundConstraint : public NormalizedConstraint { + using NormalizedConstraint::NormalizedConstraint; - enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; +public: + static CompoundConstraint *Create(ASTContext &Ctx, NormalizedConstraint *LHS, + CompoundConstraintKind CCK, + NormalizedConstraint *RHS) { + return new (Ctx) CompoundConstraint(LHS, CCK, RHS); + } - using CompoundConstraint = llvm::PointerIntPair<NormalizedConstraintPair *, 1, - CompoundConstraintKind>; + static CompoundConstraint *CreateConjunction(ASTContext &Ctx, + NormalizedConstraint *LHS, + NormalizedConstraint *RHS) { + return new (Ctx) CompoundConstraint(LHS, CCK_Conjunction, RHS); + } - llvm::PointerUnion<AtomicConstraint *, FoldExpandedConstraint *, - CompoundConstraint> - Constraint; + const NormalizedConstraint &getLHS() const { return *Compound.LHS; } - NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; - NormalizedConstraint(FoldExpandedConstraint *C) : Constraint{C} {}; + NormalizedConstraint &getLHS() { return *Compound.LHS; } - NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, - NormalizedConstraint RHS, CompoundConstraintKind Kind); + const NormalizedConstraint &getRHS() const { return *Compound.RHS; } - NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other); - NormalizedConstraint(NormalizedConstraint &&Other): - Constraint(Other.Constraint) { - Other.Constraint = nullptr; + NormalizedConstraint &getRHS() { return *Compound.RHS; } + + CompoundConstraintKind getCompoundKind() const { + return static_cast<CompoundConstraintKind>(Compound.CCK); } - NormalizedConstraint &operator=(const NormalizedConstraint &Other) = delete; - NormalizedConstraint &operator=(NormalizedConstraint &&Other) { - if (&Other != this) { - NormalizedConstraint Temp(std::move(Other)); - std::swap(Constraint, Temp.Constraint); - } - return *this; +}; + +class NormalizedConstraintWithParamMapping : public NormalizedConstraint { +protected: + using NormalizedConstraint::NormalizedConstraint; + +public: + using NormalizedConstraint::getParameterMapping; + using NormalizedConstraint::getUsedTemplateParamList; + using NormalizedConstraint::hasMatchingParameterMapping; + using NormalizedConstraint::hasParameterMapping; + using 
NormalizedConstraint::mappingOccurenceList; + using NormalizedConstraint::mappingOccurenceListForSubsumption; + using NormalizedConstraint::updateParameterMapping; + + const NamedDecl *getConstraintDecl() const { return Atomic.ConstraintDecl; } + + UnsignedOrNone getPackSubstitutionIndex() const { + return UnsignedOrNone::fromInternalRepresentation( + Atomic.PackSubstitutionIndex); } +}; + +class AtomicConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; - bool isAtomic() const { return llvm::isa<AtomicConstraint *>(Constraint); } - bool isFoldExpanded() const { - return llvm::isa<FoldExpandedConstraint *>(Constraint); +public: + static AtomicConstraint *Create(ASTContext &Ctx, const Expr *ConstraintExpr, + const NamedDecl *ConstraintDecl, + UnsignedOrNone PackIndex) { + return new (Ctx) + AtomicConstraint(ConstraintExpr, ConstraintDecl, PackIndex); } - bool isCompound() const { return llvm::isa<CompoundConstraint>(Constraint); } - CompoundConstraintKind getCompoundKind() const; + const Expr *getConstraintExpr() const { + return cast<const Expr *>(Atomic.ConstraintExpr); + } +}; - NormalizedConstraint &getLHS() const; - NormalizedConstraint &getRHS() const; +class FoldExpandedConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; - AtomicConstraint *getAtomicConstraint() const; +public: + static FoldExpandedConstraint *Create(ASTContext &Ctx, const Expr *Pattern, + const NamedDecl *ConstraintDecl, + FoldOperatorKind OpKind, + NormalizedConstraint *Constraint) { + return new (Ctx) + FoldExpandedConstraint(Pattern, OpKind, Constraint, ConstraintDecl); + } - FoldExpandedConstraint *getFoldExpandedConstraint() const; + using NormalizedConstraint::hasMatchingParameterMapping; -private: - static std::optional<NormalizedConstraint> - fromAssociatedConstraints(Sema &S, const NamedDecl *D, - ArrayRef<AssociatedConstraint> ACs); - static std::optional<NormalizedConstraint> - fromConstraintExpr(Sema &S, const NamedDecl *D, const Expr *E); -}; + FoldOperatorKind getFoldOperator() const { + return static_cast<FoldOperatorKind>(FoldExpanded.FoldOperator); + } -struct alignas(ConstraintAlignment) NormalizedConstraintPair { - NormalizedConstraint LHS, RHS; -}; + const Expr *getPattern() const { return FoldExpanded.Pattern; } -struct alignas(ConstraintAlignment) FoldExpandedConstraint { - enum class FoldOperatorKind { And, Or } Kind; - NormalizedConstraint Constraint; - const Expr *Pattern; + const NormalizedConstraint &getNormalizedPattern() const { + return *FoldExpanded.Constraint; + } - FoldExpandedConstraint(FoldOperatorKind K, NormalizedConstraint C, - const Expr *Pattern) - : Kind(K), Constraint(std::move(C)), Pattern(Pattern) {}; + NormalizedConstraint &getNormalizedPattern() { + return *FoldExpanded.Constraint; + } static bool AreCompatibleForSubsumption(const FoldExpandedConstraint &A, const FoldExpandedConstraint &B); }; -const NormalizedConstraint *getNormalizedAssociatedConstraints( - Sema &S, const NamedDecl *ConstrainedDecl, - ArrayRef<AssociatedConstraint> AssociatedConstraints); +class ConceptIdConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; + +public: + static ConceptIdConstraint * + Create(ASTContext &Ctx, const ConceptReference *ConceptId, + NormalizedConstraint *SubConstraint, const NamedDecl *ConstraintDecl, 
+ const ConceptSpecializationExpr *CSE, UnsignedOrNone PackIndex) { + return new (Ctx) ConceptIdConstraint(ConceptId, ConstraintDecl, + SubConstraint, CSE, PackIndex); + } + + const ConceptSpecializationExpr *getConceptSpecializationExpr() const { + return ConceptId.CSE; + } + + const ConceptReference *getConceptId() const { + return cast<const ConceptReference *>(ConceptId.ConstraintExpr); + } + + const NormalizedConstraint &getNormalizedConstraint() const { + return *ConceptId.Sub; + } + + NormalizedConstraint &getNormalizedConstraint() { return *ConceptId.Sub; } +}; + +struct UnsubstitutedConstraintSatisfactionCacheResult { + ExprResult SubstExpr; + ConstraintSatisfaction Satisfaction; +}; /// \brief SubsumptionChecker establishes subsumption /// between two set of constraints. @@ -189,13 +465,13 @@ private: }; struct MappedAtomicConstraint { - AtomicConstraint *Constraint; + const AtomicConstraint *Constraint; Literal ID; }; struct FoldExpendedConstraintKey { FoldExpandedConstraint::FoldOperatorKind Kind; - AtomicConstraint *Constraint; + const AtomicConstraint *Constraint; Literal ID; }; @@ -207,7 +483,7 @@ private: // A map from a literal to a corresponding associated constraint. // We do not have enough bits left for a pointer union here :( - llvm::DenseMap<uint16_t, void *> ReverseMap; + llvm::DenseMap<uint16_t, const void *> ReverseMap; // Fold expanded constraints ask us to recursively establish subsumption. // This caches the result. @@ -234,12 +510,12 @@ private: FormulaType Normalize(const NormalizedConstraint &C); void AddUniqueClauseToFormula(Formula &F, Clause C); - Literal find(AtomicConstraint *); - Literal find(FoldExpandedConstraint *); + Literal find(const AtomicConstraint *); + Literal find(const FoldExpandedConstraint *); uint16_t getNewLiteralId(); }; -} // clang +} // namespace clang #endif // LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 115c19d..60c7d27 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -234,21 +234,25 @@ enum class TemplateSubstitutionKind : char { /// Replaces the current 'innermost' level with the provided argument list. /// This is useful for type deduction cases where we need to get the entire /// list from the AST, but then add the deduced innermost list. - void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) { + void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args, + bool Final = false) { assert((!TemplateArgumentLists.empty() || NumRetainedOuterLevels) && "Replacing in an empty list?"); if (!TemplateArgumentLists.empty()) { - assert((TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() || - TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() == - AssociatedDecl) && - "Trying to change incorrect declaration?"); TemplateArgumentLists[0].Args = Args; - } else { - --NumRetainedOuterLevels; - TemplateArgumentLists.push_back( - {{AssociatedDecl, /*Final=*/false}, Args}); + return; } + --NumRetainedOuterLevels; + TemplateArgumentLists.push_back( + {{AssociatedDecl, /*Final=*/Final}, Args}); + } + + void replaceOutermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) { + assert((!TemplateArgumentLists.empty()) && "Replacing in an empty list?"); + TemplateArgumentLists.back().AssociatedDeclAndFinal.setPointer( + AssociatedDecl); + TemplateArgumentLists.back().Args = Args; } /// Add an outermost level that we are not substituting. 
We have no diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp index d658890..fd12bc4 100644 --- a/clang/lib/AST/ASTConcept.cpp +++ b/clang/lib/AST/ASTConcept.cpp @@ -24,13 +24,18 @@ static void CreateUnsatisfiedConstraintRecord(const ASTContext &C, const UnsatisfiedConstraintRecord &Detail, UnsatisfiedConstraintRecord *TrailingObject) { - if (auto *E = dyn_cast<Expr *>(Detail)) + if (Detail.isNull()) + new (TrailingObject) UnsatisfiedConstraintRecord(nullptr); + else if (const auto *E = llvm::dyn_cast<const Expr *>(Detail)) new (TrailingObject) UnsatisfiedConstraintRecord(E); + else if (const auto *Concept = + llvm::dyn_cast<const ConceptReference *>(Detail)) + new (TrailingObject) UnsatisfiedConstraintRecord(Concept); else { auto &SubstitutionDiagnostic = - *cast<std::pair<SourceLocation, StringRef> *>(Detail); + *cast<const clang::ConstraintSubstitutionDiagnostic *>(Detail); StringRef Message = C.backupStr(SubstitutionDiagnostic.second); - auto *NewSubstDiag = new (C) std::pair<SourceLocation, StringRef>( + auto *NewSubstDiag = new (C) clang::ConstraintSubstitutionDiagnostic( SubstitutionDiagnostic.first, Message); new (TrailingObject) UnsatisfiedConstraintRecord(NewSubstDiag); } @@ -74,9 +79,10 @@ ASTConstraintSatisfaction *ASTConstraintSatisfaction::Rebuild( return new (Mem) ASTConstraintSatisfaction(C, Satisfaction); } -void ConstraintSatisfaction::Profile( - llvm::FoldingSetNodeID &ID, const ASTContext &C, - const NamedDecl *ConstraintOwner, ArrayRef<TemplateArgument> TemplateArgs) { +void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID, + const ASTContext &C, + const NamedDecl *ConstraintOwner, + ArrayRef<TemplateArgument> TemplateArgs) { ID.AddPointer(ConstraintOwner); ID.AddInteger(TemplateArgs.size()); for (auto &Arg : TemplateArgs) @@ -116,6 +122,19 @@ void ConceptReference::print(llvm::raw_ostream &OS, } } +const StreamingDiagnostic &clang::operator<<(const StreamingDiagnostic &DB, + const ConceptReference *C) { + std::string NameStr; + llvm::raw_string_ostream OS(NameStr); + LangOptions LO; + LO.CPlusPlus = true; + LO.Bool = true; + OS << '\''; + C->print(OS, PrintingPolicy(LO)); + OS << '\''; + return DB << NameStr; +} + concepts::ExprRequirement::ExprRequirement( Expr *E, bool IsSimple, SourceLocation NoexceptLoc, ReturnTypeRequirement Req, SatisfactionStatus Status, diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 1c8fd83..f43fa8c 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1069,22 +1069,22 @@ Error ASTNodeImporter::ImportConstraintSatisfaction( ToSat.ContainsErrors = FromSat.ContainsErrors; if (!ToSat.IsSatisfied) { for (auto Record = FromSat.begin(); Record != FromSat.end(); ++Record) { - if (Expr *E = Record->dyn_cast<Expr *>()) { + if (const Expr *E = Record->dyn_cast<const Expr *>()) { ExpectedExpr ToSecondExpr = import(E); if (!ToSecondExpr) return ToSecondExpr.takeError(); ToSat.Details.emplace_back(ToSecondExpr.get()); } else { - auto Pair = Record->dyn_cast<std::pair<SourceLocation, StringRef> *>(); + auto Pair = + Record->dyn_cast<const ConstraintSubstitutionDiagnostic *>(); ExpectedSLoc ToPairFirst = import(Pair->first); if (!ToPairFirst) return ToPairFirst.takeError(); StringRef ToPairSecond = ImportASTStringRef(Pair->second); - ToSat.Details.emplace_back( - new (Importer.getToContext()) - ConstraintSatisfaction::SubstitutionDiagnostic{ - ToPairFirst.get(), ToPairSecond}); + ToSat.Details.emplace_back(new (Importer.getToContext()) + 
ConstraintSubstitutionDiagnostic{ + ToPairFirst.get(), ToPairSecond}); } } } diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index dc6d232..8413090 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -12,9 +12,11 @@ #include "clang/Sema/SemaConcept.h" #include "TreeTransform.h" +#include "clang/AST/ASTConcept.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/ExprConcepts.h" +#include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/OperatorPrecedence.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Initialization.h" @@ -27,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/StringExtras.h" -#include <optional> +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace sema; @@ -85,7 +87,7 @@ public: OK_Ordinary, Loc, FPOptionsOverride{}); } }; -} +} // namespace bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression, Token NextToken, bool *PossibleNonPrimary, @@ -146,14 +148,14 @@ bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression, if (!Context.hasSameUnqualifiedType(Type, Context.BoolTy)) { Diag(ConstraintExpression->getExprLoc(), - diag::err_non_bool_atomic_constraint) << Type - << ConstraintExpression->getSourceRange(); + diag::err_non_bool_atomic_constraint) + << Type << ConstraintExpression->getSourceRange(); CheckForNonPrimary(); return false; } if (PossibleNonPrimary) - *PossibleNonPrimary = false; + *PossibleNonPrimary = false; return true; } @@ -164,52 +166,315 @@ struct SatisfactionStackRAII { SatisfactionStackRAII(Sema &SemaRef, const NamedDecl *ND, const llvm::FoldingSetNodeID &FSNID) : SemaRef(SemaRef) { - if (ND) { + if (ND) { SemaRef.PushSatisfactionStackEntry(ND, FSNID); Inserted = true; - } + } } ~SatisfactionStackRAII() { - if (Inserted) - SemaRef.PopSatisfactionStackEntry(); + if (Inserted) + SemaRef.PopSatisfactionStackEntry(); } }; } // namespace -static bool -DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID, - const NamedDecl *Templ, const Expr *E, - const MultiLevelTemplateArgumentList &MLTAL) { +static bool DiagRecursiveConstraintEval( + Sema &S, llvm::FoldingSetNodeID &ID, const NamedDecl *Templ, const Expr *E, + const MultiLevelTemplateArgumentList *MLTAL = nullptr) { E->Profile(ID, S.Context, /*Canonical=*/true); - for (const auto &List : MLTAL) - for (const auto &TemplateArg : List.Args) - TemplateArg.Profile(ID, S.Context); - - // Note that we have to do this with our own collection, because there are - // times where a constraint-expression check can cause us to need to evaluate - // other constriants that are unrelated, such as when evaluating a recovery - // expression, or when trying to determine the constexpr-ness of special - // members. Otherwise we could just use the - // Sema::InstantiatingTemplate::isAlreadyBeingInstantiated function. 
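// Editor's sketch (illustrative, not part of this patch): the satisfaction-stack
// probe above is what turns self-referential constraint checking into a
// diagnostic instead of unbounded recursion. A minimal user-level trigger,
// assuming overload resolution inside the constraint reaches the constrained
// overload itself:
template <class T>
concept Fooable = requires(T t) { foo(t); };  // satisfaction needs overload resolution for foo

template <Fooable T>
void foo(T t);                                // candidate whose own constraint is Fooable<T>

// A call such as foo(0) makes checking Fooable<int> resolve foo(t), which
// re-checks Fooable<int>; the stack probe reports that satisfaction of the
// constraint depends on itself instead of recursing forever.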
+ if (MLTAL) { + for (const auto &List : *MLTAL) + for (const auto &TemplateArg : List.Args) + S.Context.getCanonicalTemplateArgument(TemplateArg) + .Profile(ID, S.Context); + } if (S.SatisfactionStackContains(Templ, ID)) { S.Diag(E->getExprLoc(), diag::err_constraint_depends_on_self) << E << E->getSourceRange(); return true; } - return false; } -static ExprResult EvaluateAtomicConstraint( - Sema &S, const Expr *AtomicExpr, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { +// Figure out the to-translation-unit depth for this function declaration for +// the purpose of seeing if they differ by constraints. This isn't the same as +// getTemplateDepth, because it includes already instantiated parents. +static unsigned +CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND, + bool SkipForSpecialization = false) { + MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( + ND, ND->getLexicalDeclContext(), /*Final=*/false, + /*Innermost=*/std::nullopt, + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true, SkipForSpecialization); + return MLTAL.getNumLevels(); +} + +namespace { +class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> { + unsigned TemplateDepth = 0; + +public: + using inherited = TreeTransform<AdjustConstraintDepth>; + AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth) + : inherited(SemaRef), TemplateDepth(TemplateDepth) {} + + using inherited::TransformTemplateTypeParmType; + QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, + TemplateTypeParmTypeLoc TL, bool) { + const TemplateTypeParmType *T = TL.getTypePtr(); + + TemplateTypeParmDecl *NewTTPDecl = nullptr; + if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl()) + NewTTPDecl = cast_or_null<TemplateTypeParmDecl>( + TransformDecl(TL.getNameLoc(), OldTTPDecl)); + + QualType Result = getSema().Context.getTemplateTypeParmType( + T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(), + NewTTPDecl); + TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result); + NewTL.setNameLoc(TL.getNameLoc()); + return Result; + } + + bool AlreadyTransformed(QualType T) { + if (T.isNull()) + return true; + + if (T->isInstantiationDependentType() || T->isVariablyModifiedType() || + T->containsUnexpandedParameterPack()) + return false; + return true; + } +}; +} // namespace + +namespace { + +// FIXME: Convert it to DynamicRecursiveASTVisitor +class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> { + using inherited = RecursiveASTVisitor<HashParameterMapping>; + friend inherited; + + Sema &SemaRef; + const MultiLevelTemplateArgumentList &TemplateArgs; + llvm::FoldingSetNodeID &ID; + llvm::SmallVector<TemplateArgument, 10> UsedTemplateArgs; + + UnsignedOrNone OuterPackSubstIndex; + + TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) { + assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size()); + Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex]; + if (Arg.isPackExpansion()) + Arg = Arg.getPackExpansionPattern(); + return Arg; + } + + bool shouldVisitTemplateInstantiations() const { return true; } + +public: + HashParameterMapping(Sema &SemaRef, + const MultiLevelTemplateArgumentList &TemplateArgs, + llvm::FoldingSetNodeID &ID, + UnsignedOrNone OuterPackSubstIndex) + : SemaRef(SemaRef), TemplateArgs(TemplateArgs), ID(ID), + OuterPackSubstIndex(OuterPackSubstIndex) {} + + bool 
VisitTemplateTypeParmType(TemplateTypeParmType *T) { + // A lambda expression can introduce template parameters that don't have + // corresponding template arguments yet. + if (T->getDepth() >= TemplateArgs.getNumLevels()) + return true; + + TemplateArgument Arg = TemplateArgs(T->getDepth(), T->getIndex()); + + if (T->isParameterPack() && SemaRef.ArgPackSubstIndex) { + assert(Arg.getKind() == TemplateArgument::Pack && + "Missing argument pack"); + + Arg = getPackSubstitutedTemplateArgument(Arg); + } + + UsedTemplateArgs.push_back( + SemaRef.Context.getCanonicalTemplateArgument(Arg)); + return true; + } + + bool VisitDeclRefExpr(DeclRefExpr *E) { + NamedDecl *D = E->getDecl(); + NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(D); + if (!NTTP) + return TraverseDecl(D); + + TemplateArgument Arg = TemplateArgs(NTTP->getDepth(), NTTP->getPosition()); + if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) { + assert(Arg.getKind() == TemplateArgument::Pack && + "Missing argument pack"); + Arg = getPackSubstitutedTemplateArgument(Arg); + } + + UsedTemplateArgs.push_back( + SemaRef.Context.getCanonicalTemplateArgument(Arg)); + return true; + } + + bool VisitTypedefType(TypedefType *TT) { + return inherited::TraverseType(TT->desugar()); + } + + bool TraverseDecl(Decl *D) { + if (auto *VD = dyn_cast<ValueDecl>(D)) + return TraverseType(VD->getType()); + + return inherited::TraverseDecl(D); + } + + bool TraverseTypeLoc(TypeLoc TL, bool TraverseQualifier = true) { + // We don't care about TypeLocs. So traverse Types instead. + return TraverseType(TL.getType(), TraverseQualifier); + } + + bool TraverseTagType(const TagType *T, bool TraverseQualifier) { + // T's parent can be dependent while T doesn't have any template arguments. + // We should have already traversed its qualifier. + // FIXME: Add an assert to catch cases where we failed to profile the + // concept. assert(!T->isDependentType() && "We missed a case in profiling + // concepts!"); + return true; + } + + bool TraverseInjectedClassNameType(InjectedClassNameType *T, + bool TraverseQualifier) { + return TraverseTemplateArguments(T->getTemplateArgs(SemaRef.Context)); + } + + bool TraverseTemplateArgument(const TemplateArgument &Arg) { + if (!Arg.containsUnexpandedParameterPack() || Arg.isPackExpansion()) { + // Act as if we are fully expanding this pack, if it is a PackExpansion. + Sema::ArgPackSubstIndexRAII _1(SemaRef, std::nullopt); + llvm::SaveAndRestore<UnsignedOrNone> _2(OuterPackSubstIndex, + std::nullopt); + return inherited::TraverseTemplateArgument(Arg); + } + + Sema::ArgPackSubstIndexRAII _1(SemaRef, OuterPackSubstIndex); + return inherited::TraverseTemplateArgument(Arg); + } + + void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) { + if (!Constraint.hasParameterMapping()) { + for (const auto &List : TemplateArgs) + for (const TemplateArgument &Arg : List.Args) + SemaRef.Context.getCanonicalTemplateArgument(Arg).Profile( + ID, SemaRef.Context); + return; + } + + llvm::ArrayRef<TemplateArgumentLoc> Mapping = + Constraint.getParameterMapping(); + for (auto &ArgLoc : Mapping) { + TemplateArgument Canonical = + SemaRef.Context.getCanonicalTemplateArgument(ArgLoc.getArgument()); + // We don't want sugars to impede the profile of cache. 
+ UsedTemplateArgs.push_back(Canonical); + TraverseTemplateArgument(Canonical); + } + + for (auto &Used : UsedTemplateArgs) { + llvm::FoldingSetNodeID R; + Used.Profile(R, SemaRef.Context); + ID.AddNodeID(R); + } + } +}; + +class ConstraintSatisfactionChecker { + Sema &S; + const NamedDecl *Template; + SourceLocation TemplateNameLoc; + UnsignedOrNone PackSubstitutionIndex; + + ConstraintSatisfaction &Satisfaction; + +private: + ExprResult + EvaluateAtomicConstraint(const Expr *AtomicExpr, + const MultiLevelTemplateArgumentList &MLTAL); + + UnsignedOrNone EvaluateFoldExpandedConstraintSize( + const FoldExpandedConstraint &FE, + const MultiLevelTemplateArgumentList &MLTAL); + + // XXX: It is SLOW! Use it very carefully. + std::optional<MultiLevelTemplateArgumentList> SubstitutionInTemplateArguments( + const NormalizedConstraintWithParamMapping &Constraint, + MultiLevelTemplateArgumentList MLTAL, + llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost); + + ExprResult EvaluateSlow(const AtomicConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + + ExprResult Evaluate(const AtomicConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + + ExprResult EvaluateSlow(const FoldExpandedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + + ExprResult Evaluate(const FoldExpandedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + + ExprResult EvaluateSlow(const ConceptIdConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL, + unsigned int Size); + + ExprResult Evaluate(const ConceptIdConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + + ExprResult Evaluate(const CompoundConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); + +public: + ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template, + SourceLocation TemplateNameLoc, + UnsignedOrNone PackSubstitutionIndex, + ConstraintSatisfaction &Satisfaction) + : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc), + PackSubstitutionIndex(PackSubstitutionIndex), + Satisfaction(Satisfaction) {} + + ExprResult Evaluate(const NormalizedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL); +}; + +StringRef allocateStringFromConceptDiagnostic(const Sema &S, + const PartialDiagnostic Diag) { + SmallString<128> DiagString; + DiagString = ": "; + Diag.EmitToString(S.getDiagnostics(), DiagString); + return S.getASTContext().backupStr(DiagString); +} + +} // namespace + +ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint( + const Expr *AtomicExpr, const MultiLevelTemplateArgumentList &MLTAL) { EnterExpressionEvaluationContext ConstantEvaluated( S, Sema::ExpressionEvaluationContext::ConstantEvaluated, Sema::ReuseLambdaContextDecl); + llvm::FoldingSetNodeID ID; + if (Template && + DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, &MLTAL)) { + Satisfaction.IsSatisfied = false; + Satisfaction.ContainsErrors = true; + return ExprEmpty(); + } + SatisfactionStackRAII StackRAII(S, Template, ID); + // Atomic constraint - substitute arguments and check satisfaction. 
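// Editor's sketch (illustrative, not part of this patch): the SFINAE trap set
// up below is what makes a substitution failure inside an atomic constraint
// mean "not satisfied" rather than a hard error; the failure text is preserved
// as a ConstraintSubstitutionDiagnostic so later notes can replay it.
template <class T>
concept HasNestedType = sizeof(typename T::type) > 0;  // a single atomic constraint

struct WithType { using type = int; };

static_assert(HasNestedType<WithType>);  // substitution succeeds, expression is true
static_assert(!HasNestedType<int>);      // 'int::type' fails to substitute: merely unsatisfied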
- ExprResult SubstitutedExpression; + ExprResult SubstitutedExpression = const_cast<Expr *>(AtomicExpr); { TemplateDeductionInfo Info(TemplateNameLoc); Sema::InstantiatingTemplate Inst( @@ -220,16 +485,6 @@ static ExprResult EvaluateAtomicConstraint( if (Inst.isInvalid()) return ExprError(); - llvm::FoldingSetNodeID ID; - if (Template && - DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, MLTAL)) { - Satisfaction.IsSatisfied = false; - Satisfaction.ContainsErrors = true; - return ExprEmpty(); - } - - SatisfactionStackRAII StackRAII(S, Template, ID); - // We do not want error diagnostics escaping here. Sema::SFINAETrap Trap(S); SubstitutedExpression = @@ -247,21 +502,16 @@ static ExprResult EvaluateAtomicConstraint( PartialDiagnosticAt SubstDiag{SourceLocation(), PartialDiagnostic::NullDiagnostic()}; Info.takeSFINAEDiagnostic(SubstDiag); - // FIXME: Concepts: This is an unfortunate consequence of there + // FIXME: This is an unfortunate consequence of there // being no serialization code for PartialDiagnostics and the fact // that serializing them would likely take a lot more storage than // just storing them as strings. We would still like, in the // future, to serialize the proper PartialDiagnostic as serializing // it as a string defeats the purpose of the diagnostic mechanism. - SmallString<128> DiagString; - DiagString = ": "; - SubstDiag.second.EmitToString(S.getDiagnostics(), DiagString); - unsigned MessageSize = DiagString.size(); - char *Mem = new (S.Context) char[MessageSize]; - memcpy(Mem, DiagString.c_str(), MessageSize); Satisfaction.Details.emplace_back( - new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{ - SubstDiag.first, StringRef(Mem, MessageSize)}); + new (S.Context) ConstraintSubstitutionDiagnostic{ + SubstDiag.first, + allocateStringFromConceptDiagnostic(S, SubstDiag.second)}); Satisfaction.IsSatisfied = false; return ExprEmpty(); } @@ -289,216 +539,94 @@ static ExprResult EvaluateAtomicConstraint( return SubstitutedExpression; } -static UnsignedOrNone EvaluateFoldExpandedConstraintSize( - Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { - - // We should ignore errors in the presence of packs of different size. 
- Sema::SFINAETrap Trap(S); - - Expr *Pattern = FE->getPattern(); +std::optional<MultiLevelTemplateArgumentList> +ConstraintSatisfactionChecker::SubstitutionInTemplateArguments( + const NormalizedConstraintWithParamMapping &Constraint, + MultiLevelTemplateArgumentList MLTAL, + llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost) { - SmallVector<UnexpandedParameterPack, 2> Unexpanded; - S.collectUnexpandedParameterPacks(Pattern, Unexpanded); - assert(!Unexpanded.empty() && "Pack expansion without parameter packs?"); - bool Expand = true; - bool RetainExpansion = false; - UnsignedOrNone NumExpansions = FE->getNumExpansions(); - if (S.CheckParameterPacksForExpansion( - FE->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL, - /*FailOnPackProducingTemplates=*/true, Expand, RetainExpansion, - NumExpansions) || - !Expand || RetainExpansion) - return std::nullopt; + if (!Constraint.hasParameterMapping()) + return std::move(MLTAL); - if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) { - S.Diag(FE->getEllipsisLoc(), - clang::diag::err_fold_expression_limit_exceeded) - << *NumExpansions << S.getLangOpts().BracketDepth - << FE->getSourceRange(); - S.Diag(FE->getEllipsisLoc(), diag::note_bracket_depth); + TemplateDeductionInfo Info(Constraint.getBeginLoc()); + Sema::InstantiatingTemplate Inst( + S, Constraint.getBeginLoc(), + Sema::InstantiatingTemplate::ConstraintSubstitution{}, + // FIXME: improve const-correctness of InstantiatingTemplate + const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange()); + if (Inst.isInvalid()) return std::nullopt; - } - return NumExpansions; -} - -static ExprResult calculateConstraintSatisfaction( - Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction); - -static ExprResult calculateConstraintSatisfaction( - Sema &S, const Expr *LHS, OverloadedOperatorKind Op, const Expr *RHS, - const NamedDecl *Template, SourceLocation TemplateNameLoc, - const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { - size_t EffectiveDetailEndIndex = Satisfaction.Details.size(); - - ExprResult LHSRes = calculateConstraintSatisfaction( - S, LHS, Template, TemplateNameLoc, MLTAL, Satisfaction); - - if (LHSRes.isInvalid()) - return ExprError(); - - bool IsLHSSatisfied = Satisfaction.IsSatisfied; - - if (Op == clang::OO_PipePipe && IsLHSSatisfied) - // [temp.constr.op] p3 - // A disjunction is a constraint taking two operands. To determine if - // a disjunction is satisfied, the satisfaction of the first operand - // is checked. If that is satisfied, the disjunction is satisfied. - // Otherwise, the disjunction is satisfied if and only if the second - // operand is satisfied. - // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. - return LHSRes; - - if (Op == clang::OO_AmpAmp && !IsLHSSatisfied) - // [temp.constr.op] p2 - // A conjunction is a constraint taking two operands. To determine if - // a conjunction is satisfied, the satisfaction of the first operand - // is checked. If that is not satisfied, the conjunction is not - // satisfied. Otherwise, the conjunction is satisfied if and only if - // the second operand is satisfied. - // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp. 
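// Editor's sketch (illustrative, not part of this patch): the [temp.constr.op]
// wording quoted in the removed comments above is observable from user code;
// once the left operand of a constraint conjunction is unsatisfied, the right
// operand is never substituted into, so no BinaryOperator needs to be built.
#include <type_traits>

template <class T>
concept PointerLike = std::is_pointer_v<T> && requires(T p) { *p; };

static_assert(!PointerLike<int>);  // LHS already false: '*p' is never instantiated
static_assert(PointerLike<int *>); // LHS true, so the requires-expression is checked too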
- return LHSRes; - - ExprResult RHSRes = calculateConstraintSatisfaction( - S, RHS, Template, TemplateNameLoc, MLTAL, Satisfaction); - if (RHSRes.isInvalid()) - return ExprError(); - bool IsRHSSatisfied = Satisfaction.IsSatisfied; - // Current implementation adds diagnostic information about the falsity - // of each false atomic constraint expression when it evaluates them. - // When the evaluation results to `false || true`, the information - // generated during the evaluation of left-hand side is meaningless - // because the whole expression evaluates to true. - // The following code removes the irrelevant diagnostic information. - // FIXME: We should probably delay the addition of diagnostic information - // until we know the entire expression is false. - if (Op == clang::OO_PipePipe && IsRHSSatisfied) { - auto EffectiveDetailEnd = Satisfaction.Details.begin(); - std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex); - Satisfaction.Details.erase(EffectiveDetailEnd, Satisfaction.Details.end()); - } - - if (!LHSRes.isUsable() || !RHSRes.isUsable()) - return ExprEmpty(); - - return BinaryOperator::Create(S.Context, LHSRes.get(), RHSRes.get(), - BinaryOperator::getOverloadedOpcode(Op), - S.Context.BoolTy, VK_PRValue, OK_Ordinary, - LHS->getBeginLoc(), FPOptionsOverride{}); -} - -static ExprResult calculateConstraintSatisfaction( - Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { - bool Conjunction = FE->getOperator() == BinaryOperatorKind::BO_LAnd; - size_t EffectiveDetailEndIndex = Satisfaction.Details.size(); - - ExprResult Out; - if (FE->isLeftFold() && FE->getInit()) { - Out = calculateConstraintSatisfaction(S, FE->getInit(), Template, - TemplateNameLoc, MLTAL, Satisfaction); - if (Out.isInvalid()) - return ExprError(); + Sema::SFINAETrap Trap(S); - // If the first clause of a conjunction is not satisfied, - // or if the first clause of a disjection is satisfied, - // we have established satisfaction of the whole constraint - // and we should not continue further. - if (Conjunction != Satisfaction.IsSatisfied) - return Out; - } - UnsignedOrNone NumExpansions = EvaluateFoldExpandedConstraintSize( - S, FE, Template, TemplateNameLoc, MLTAL, Satisfaction); - if (!NumExpansions) - return ExprError(); - for (unsigned I = 0; I < *NumExpansions; I++) { - Sema::ArgPackSubstIndexRAII SubstIndex(S, I); - ExprResult Res = calculateConstraintSatisfaction( - S, FE->getPattern(), Template, TemplateNameLoc, MLTAL, Satisfaction); - if (Res.isInvalid()) - return ExprError(); - bool IsRHSSatisfied = Satisfaction.IsSatisfied; - if (!Conjunction && IsRHSSatisfied) { - auto EffectiveDetailEnd = Satisfaction.Details.begin(); - std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex); - Satisfaction.Details.erase(EffectiveDetailEnd, - Satisfaction.Details.end()); - } - if (Out.isUnset()) - Out = Res; - else if (!Res.isUnset()) { - Out = BinaryOperator::Create( - S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy, - VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{}); - } - if (Conjunction != IsRHSSatisfied) - return Out; + TemplateArgumentListInfo SubstArgs; + Sema::ArgPackSubstIndexRAII SubstIndex( + S, Constraint.getPackSubstitutionIndex() + ? 
Constraint.getPackSubstitutionIndex() + : PackSubstitutionIndex); + + if (S.SubstTemplateArgumentsInParameterMapping( + Constraint.getParameterMapping(), Constraint.getBeginLoc(), MLTAL, + SubstArgs, /*BuildPackExpansionTypes=*/true)) { + Satisfaction.IsSatisfied = false; + return std::nullopt; } - if (FE->isRightFold() && FE->getInit()) { - ExprResult Res = calculateConstraintSatisfaction( - S, FE->getInit(), Template, TemplateNameLoc, MLTAL, Satisfaction); - if (Out.isInvalid()) - return ExprError(); - - if (Out.isUnset()) - Out = Res; - else if (!Res.isUnset()) { - Out = BinaryOperator::Create( - S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy, - VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{}); + Sema::CheckTemplateArgumentInfo CTAI; + auto *TD = const_cast<TemplateDecl *>( + cast<TemplateDecl>(Constraint.getConstraintDecl())); + if (S.CheckTemplateArgumentList(TD, Constraint.getUsedTemplateParamList(), + TD->getLocation(), SubstArgs, + /*DefaultArguments=*/{}, + /*PartialTemplateArgs=*/false, CTAI)) + return std::nullopt; + const NormalizedConstraint::OccurenceList &Used = + Constraint.mappingOccurenceList(); + SubstitutedOuterMost = + llvm::to_vector_of<TemplateArgument>(MLTAL.getOutermost()); + unsigned Offset = 0; + for (unsigned I = 0, MappedIndex = 0; I < Used.size(); I++) { + TemplateArgument Arg; + if (Used[I]) + Arg = S.Context.getCanonicalTemplateArgument( + CTAI.SugaredConverted[MappedIndex++]); + if (I < SubstitutedOuterMost.size()) { + SubstitutedOuterMost[I] = Arg; + Offset = I + 1; + } else { + SubstitutedOuterMost.push_back(Arg); + Offset = SubstitutedOuterMost.size(); } } + if (Offset < SubstitutedOuterMost.size()) + SubstitutedOuterMost.erase(SubstitutedOuterMost.begin() + Offset); - if (Out.isUnset()) { - Satisfaction.IsSatisfied = Conjunction; - Out = S.BuildEmptyCXXFoldExpr(FE->getBeginLoc(), FE->getOperator()); - } - return Out; + MLTAL.replaceOutermostTemplateArguments( + const_cast<NamedDecl *>(Constraint.getConstraintDecl()), + SubstitutedOuterMost); + return std::move(MLTAL); } -static ExprResult calculateConstraintSatisfaction( - Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { - ConstraintExpr = ConstraintExpr->IgnoreParenImpCasts(); - - if (LogicalBinOp BO = ConstraintExpr) - return calculateConstraintSatisfaction( - S, BO.getLHS(), BO.getOp(), BO.getRHS(), Template, TemplateNameLoc, - MLTAL, Satisfaction); +ExprResult ConstraintSatisfactionChecker::EvaluateSlow( + const AtomicConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { - if (auto *C = dyn_cast<ExprWithCleanups>(ConstraintExpr)) { - // These aren't evaluated, so we don't care about cleanups, so we can just - // evaluate these as if the cleanups didn't exist. 
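// Editor's sketch (illustrative, not part of this patch): the parameter-mapping
// substitution performed here is what ties a concept's own template parameters
// to the arguments written at the point of use ([temp.constr.atomic]p1).
template <class T>
concept Addable = requires(T a, T b) { a + b; };   // atomic constraint written in terms of T

template <class Range>
concept SummableRange = Addable<typename Range::value_type>;
// Normalizing SummableRange<Range> yields Addable's atomic constraint together
// with the parameter mapping T -> typename Range::value_type; satisfaction
// first substitutes through that mapping, then evaluates the expression.

struct Ints { using value_type = int; };
static_assert(SummableRange<Ints>);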
- return calculateConstraintSatisfaction( - S, C->getSubExpr(), Template, TemplateNameLoc, MLTAL, Satisfaction); - } - - if (auto *FE = dyn_cast<CXXFoldExpr>(ConstraintExpr); - FE && S.getLangOpts().CPlusPlus26 && - (FE->getOperator() == BinaryOperatorKind::BO_LAnd || - FE->getOperator() == BinaryOperatorKind::BO_LOr)) { - return calculateConstraintSatisfaction(S, FE, Template, TemplateNameLoc, - MLTAL, Satisfaction); + llvm::SmallVector<TemplateArgument> SubstitutedOuterMost; + std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs = + SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost); + if (!SubstitutedArgs) { + Satisfaction.IsSatisfied = false; + return ExprEmpty(); } - // FIXME: We should not treat ConceptSpecializationExpr as atomic constraints. - - // An atomic constraint expression + Sema::ArgPackSubstIndexRAII SubstIndex(S, PackSubstitutionIndex); ExprResult SubstitutedAtomicExpr = EvaluateAtomicConstraint( - S, ConstraintExpr, Template, TemplateNameLoc, MLTAL, Satisfaction); + Constraint.getConstraintExpr(), *SubstitutedArgs); if (SubstitutedAtomicExpr.isInvalid()) return ExprError(); - if (!SubstitutedAtomicExpr.isUsable()) + if (SubstitutedAtomicExpr.isUnset()) // Evaluator has decided satisfaction without yielding an expression. return ExprEmpty(); @@ -512,16 +640,16 @@ static ExprResult calculateConstraintSatisfaction( Satisfaction.ContainsErrors = true; PartialDiagnostic Msg = S.PDiag(diag::note_constraint_references_error); - SmallString<128> DiagString; - DiagString = ": "; - Msg.EmitToString(S.getDiagnostics(), DiagString); - unsigned MessageSize = DiagString.size(); - char *Mem = new (S.Context) char[MessageSize]; - memcpy(Mem, DiagString.c_str(), MessageSize); Satisfaction.Details.emplace_back( - new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{ + new (S.Context) ConstraintSubstitutionDiagnostic{ SubstitutedAtomicExpr.get()->getBeginLoc(), - StringRef(Mem, MessageSize)}); + allocateStringFromConceptDiagnostic(S, Msg)}); + return SubstitutedAtomicExpr; + } + + if (SubstitutedAtomicExpr.get()->isValueDependent()) { + Satisfaction.IsSatisfied = true; + Satisfaction.ContainsErrors = false; return SubstitutedAtomicExpr; } @@ -552,21 +680,384 @@ static ExprResult calculateConstraintSatisfaction( return SubstitutedAtomicExpr; } -static ExprResult calculateConstraintSatisfaction( - Sema &S, const NamedDecl *Template, SourceLocation TemplateNameLoc, - const MultiLevelTemplateArgumentList &MLTAL, const Expr *ConstraintExpr, - ConstraintSatisfaction &Satisfaction) { +ExprResult ConstraintSatisfactionChecker::Evaluate( + const AtomicConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + + unsigned Size = Satisfaction.Details.size(); + llvm::FoldingSetNodeID ID; + UnsignedOrNone OuterPackSubstIndex = + Constraint.getPackSubstitutionIndex() + ? 
Constraint.getPackSubstitutionIndex() + : PackSubstitutionIndex; + + ID.AddPointer(Constraint.getConstraintExpr()); + ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation()); + HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex) + .VisitConstraint(Constraint); + + if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID); + Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) { + + auto &Cached = Iter->second.Satisfaction; + Satisfaction.ContainsErrors = Cached.ContainsErrors; + Satisfaction.IsSatisfied = Cached.IsSatisfied; + Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, + Cached.Details.begin(), Cached.Details.end()); + return Iter->second.SubstExpr; + } + + ExprResult E = EvaluateSlow(Constraint, MLTAL); + + UnsubstitutedConstraintSatisfactionCacheResult Cache; + Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; + Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; + std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), + std::back_inserter(Cache.Satisfaction.Details)); + Cache.SubstExpr = E; + S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); + + return E; +} + +UnsignedOrNone +ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize( + const FoldExpandedConstraint &FE, + const MultiLevelTemplateArgumentList &MLTAL) { + + // We should ignore errors in the presence of packs of different size. + Sema::SFINAETrap Trap(S); + + Expr *Pattern = const_cast<Expr *>(FE.getPattern()); + + SmallVector<UnexpandedParameterPack, 2> Unexpanded; + S.collectUnexpandedParameterPacks(Pattern, Unexpanded); + assert(!Unexpanded.empty() && "Pack expansion without parameter packs?"); + bool Expand = true; + bool RetainExpansion = false; + UnsignedOrNone NumExpansions(std::nullopt); + if (S.CheckParameterPacksForExpansion( + Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL, + /*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion, + NumExpansions) || + !Expand || RetainExpansion) + return std::nullopt; + + if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) { + S.Diag(Pattern->getExprLoc(), + clang::diag::err_fold_expression_limit_exceeded) + << *NumExpansions << S.getLangOpts().BracketDepth + << Pattern->getSourceRange(); + S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth); + return std::nullopt; + } + return NumExpansions; +} - return calculateConstraintSatisfaction(S, ConstraintExpr, Template, - TemplateNameLoc, MLTAL, Satisfaction); +ExprResult ConstraintSatisfactionChecker::EvaluateSlow( + const FoldExpandedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + + bool Conjunction = Constraint.getFoldOperator() == + FoldExpandedConstraint::FoldOperatorKind::And; + unsigned EffectiveDetailEndIndex = Satisfaction.Details.size(); + + llvm::SmallVector<TemplateArgument> SubstitutedOuterMost; + // FIXME: Is PackSubstitutionIndex correct? 
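// Editor's sketch (illustrative, not part of this patch): a fold-expanded
// constraint is checked once per element of the expanded pack, advancing the
// pack-substitution index on each iteration much like the loop that follows.
template <class T>
concept Small = sizeof(T) <= 8;

template <class... Ts>
  requires (Small<Ts> && ...)   // fold-expanded constraint: one Small<Ti> check per element
void pack_values(Ts...);

// pack_values(1, 2.0) satisfies every expansion; passing a large by-value
// argument instead makes exactly one expansion fail, and the diagnostic can
// point at that element's Small<Ti> check.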
+ llvm::SaveAndRestore _(PackSubstitutionIndex, S.ArgPackSubstIndex); + std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs = + SubstitutionInTemplateArguments( + static_cast<const NormalizedConstraintWithParamMapping &>(Constraint), + MLTAL, SubstitutedOuterMost); + if (!SubstitutedArgs) { + Satisfaction.IsSatisfied = false; + return ExprError(); + } + + ExprResult Out; + UnsignedOrNone NumExpansions = + EvaluateFoldExpandedConstraintSize(Constraint, *SubstitutedArgs); + if (!NumExpansions) + return ExprEmpty(); + + if (*NumExpansions == 0) { + Satisfaction.IsSatisfied = Conjunction; + return ExprEmpty(); + } + + for (unsigned I = 0; I < *NumExpansions; I++) { + Sema::ArgPackSubstIndexRAII SubstIndex(S, I); + Satisfaction.IsSatisfied = false; + Satisfaction.ContainsErrors = false; + ExprResult Expr = + ConstraintSatisfactionChecker(S, Template, TemplateNameLoc, + UnsignedOrNone(I), Satisfaction) + .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs); + if (Expr.isUsable()) { + if (Out.isUnset()) + Out = Expr; + else + Out = BinaryOperator::Create(S.Context, Out.get(), Expr.get(), + Conjunction ? BinaryOperatorKind::BO_LAnd + : BinaryOperatorKind::BO_LOr, + S.Context.BoolTy, VK_PRValue, OK_Ordinary, + Constraint.getBeginLoc(), + FPOptionsOverride{}); + } else { + assert(!Satisfaction.IsSatisfied); + } + if (!Conjunction && Satisfaction.IsSatisfied) { + Satisfaction.Details.erase(Satisfaction.Details.begin() + + EffectiveDetailEndIndex, + Satisfaction.Details.end()); + break; + } + if (Satisfaction.IsSatisfied != Conjunction) + return Out; + } + + return Out; +} + +ExprResult ConstraintSatisfactionChecker::Evaluate( + const FoldExpandedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + + llvm::FoldingSetNodeID ID; + ID.AddPointer(Constraint.getPattern()); + HashParameterMapping(S, MLTAL, ID, std::nullopt).VisitConstraint(Constraint); + + if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID); + Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) { + + auto &Cached = Iter->second.Satisfaction; + Satisfaction.ContainsErrors = Cached.ContainsErrors; + Satisfaction.IsSatisfied = Cached.IsSatisfied; + Satisfaction.Details.insert(Satisfaction.Details.end(), + Cached.Details.begin(), Cached.Details.end()); + return Iter->second.SubstExpr; + } + + unsigned Size = Satisfaction.Details.size(); + + ExprResult E = EvaluateSlow(Constraint, MLTAL); + UnsubstitutedConstraintSatisfactionCacheResult Cache; + Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; + Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; + std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), + std::back_inserter(Cache.Satisfaction.Details)); + Cache.SubstExpr = E; + S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); + return E; +} + +ExprResult ConstraintSatisfactionChecker::EvaluateSlow( + const ConceptIdConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL, unsigned Size) { + const ConceptReference *ConceptId = Constraint.getConceptId(); + + llvm::SmallVector<TemplateArgument> SubstitutedOuterMost; + std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs = + SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost); + + if (!SubstitutedArgs) { + Satisfaction.IsSatisfied = false; + // FIXME: diagnostics? + return ExprError(); + } + + Sema::SFINAETrap Trap(S); + Sema::ArgPackSubstIndexRAII SubstIndex( + S, Constraint.getPackSubstitutionIndex() + ? 
Constraint.getPackSubstitutionIndex() + : PackSubstitutionIndex); + + const ASTTemplateArgumentListInfo *Ori = + ConceptId->getTemplateArgsAsWritten(); + TemplateDeductionInfo Info(TemplateNameLoc); + Sema::InstantiatingTemplate _( + S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{}, + const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange()); + + TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc); + if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) || + Trap.hasErrorOccurred()) { + Satisfaction.IsSatisfied = false; + if (!Trap.hasErrorOccurred()) + return ExprError(); + + PartialDiagnosticAt SubstDiag{SourceLocation(), + PartialDiagnostic::NullDiagnostic()}; + Info.takeSFINAEDiagnostic(SubstDiag); + // FIXME: This is an unfortunate consequence of there + // being no serialization code for PartialDiagnostics and the fact + // that serializing them would likely take a lot more storage than + // just storing them as strings. We would still like, in the + // future, to serialize the proper PartialDiagnostic as serializing + // it as a string defeats the purpose of the diagnostic mechanism. + Satisfaction.Details.insert( + Satisfaction.Details.begin() + Size, + new (S.Context) ConstraintSubstitutionDiagnostic{ + SubstDiag.first, + allocateStringFromConceptDiagnostic(S, SubstDiag.second)}); + return ExprError(); + } + + CXXScopeSpec SS; + SS.Adopt(ConceptId->getNestedNameSpecifierLoc()); + + ExprResult SubstitutedConceptId = S.CheckConceptTemplateId( + SS, ConceptId->getTemplateKWLoc(), ConceptId->getConceptNameInfo(), + ConceptId->getFoundDecl(), ConceptId->getNamedConcept(), &OutArgs, + /*DoCheckConstraintSatisfaction=*/false); + + if (SubstitutedConceptId.isInvalid() || Trap.hasErrorOccurred()) + return ExprError(); + + if (Size != Satisfaction.Details.size()) { + Satisfaction.Details.insert( + Satisfaction.Details.begin() + Size, + UnsatisfiedConstraintRecord( + SubstitutedConceptId.getAs<ConceptSpecializationExpr>() + ->getConceptReference())); + } + return SubstitutedConceptId; +} + +ExprResult ConstraintSatisfactionChecker::Evaluate( + const ConceptIdConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + + const ConceptReference *ConceptId = Constraint.getConceptId(); + + UnsignedOrNone OuterPackSubstIndex = + Constraint.getPackSubstitutionIndex() + ? Constraint.getPackSubstitutionIndex() + : PackSubstitutionIndex; + + Sema::InstantiatingTemplate _(S, ConceptId->getBeginLoc(), + Sema::InstantiatingTemplate::ConstraintsCheck{}, + ConceptId->getNamedConcept(), + MLTAL.getInnermost(), + Constraint.getSourceRange()); + + unsigned Size = Satisfaction.Details.size(); + + ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL); + + if (!E.isUsable()) { + Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId); + return E; + } + + // ConceptIdConstraint is only relevant for diagnostics, + // so if the normalized constraint is satisfied, we should not + // substitute into the constraint. 
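// Editor's sketch (illustrative, not part of this patch): the concept-id record
// kept for diagnostics here is what lets an unsatisfied use be reported in terms
// of the named concept before drilling down into its atomic constraints.
#include <type_traits>

template <class T>
concept Integral = std::is_integral_v<T>;

template <Integral T>
T twice(T v) { return v + v; }

// twice(1.5) fails overload resolution; the first note names the concept-id
// ("'double' does not satisfy 'Integral'") and only then explains the atomic
// constraint 'std::is_integral_v<double>' that evaluated to false.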
+ if (Satisfaction.IsSatisfied) + return E; + + llvm::FoldingSetNodeID ID; + ID.AddPointer(Constraint.getConceptId()); + ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation()); + HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex) + .VisitConstraint(Constraint); + + if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID); + Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) { + + auto &Cached = Iter->second.Satisfaction; + Satisfaction.ContainsErrors = Cached.ContainsErrors; + Satisfaction.IsSatisfied = Cached.IsSatisfied; + Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, + Cached.Details.begin(), Cached.Details.end()); + return Iter->second.SubstExpr; + } + + ExprResult CE = EvaluateSlow(Constraint, MLTAL, Size); + if (CE.isInvalid()) + return E; + UnsubstitutedConstraintSatisfactionCacheResult Cache; + Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; + Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; + std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), + std::back_inserter(Cache.Satisfaction.Details)); + Cache.SubstExpr = CE; + S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); + return CE; +} + +ExprResult ConstraintSatisfactionChecker::Evaluate( + const CompoundConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + + unsigned EffectiveDetailEndIndex = Satisfaction.Details.size(); + + bool Conjunction = + Constraint.getCompoundKind() == NormalizedConstraint::CCK_Conjunction; + + ExprResult LHS = Evaluate(Constraint.getLHS(), MLTAL); + + if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors)) + return LHS; + + if (!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied && + !Satisfaction.ContainsErrors) + return LHS; + + Satisfaction.ContainsErrors = false; + Satisfaction.IsSatisfied = false; + + ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL); + + if (RHS.isUsable() && Satisfaction.IsSatisfied && + !Satisfaction.ContainsErrors) + Satisfaction.Details.erase(Satisfaction.Details.begin() + + EffectiveDetailEndIndex, + Satisfaction.Details.end()); + + if (!LHS.isUsable()) + return RHS; + + if (!RHS.isUsable()) + return LHS; + + return BinaryOperator::Create(S.Context, LHS.get(), RHS.get(), + Conjunction ? 
BinaryOperatorKind::BO_LAnd + : BinaryOperatorKind::BO_LOr, + S.Context.BoolTy, VK_PRValue, OK_Ordinary, + Constraint.getBeginLoc(), FPOptionsOverride{}); +} + +ExprResult ConstraintSatisfactionChecker::Evaluate( + const NormalizedConstraint &Constraint, + const MultiLevelTemplateArgumentList &MLTAL) { + switch (Constraint.getKind()) { + case NormalizedConstraint::ConstraintKind::Atomic: + return Evaluate(static_cast<const AtomicConstraint &>(Constraint), MLTAL); + + case NormalizedConstraint::ConstraintKind::FoldExpanded: + return Evaluate(static_cast<const FoldExpandedConstraint &>(Constraint), + MLTAL); + + case NormalizedConstraint::ConstraintKind::ConceptId: + return Evaluate(static_cast<const ConceptIdConstraint &>(Constraint), + MLTAL); + + case NormalizedConstraint::ConstraintKind::Compound: + return Evaluate(static_cast<const CompoundConstraint &>(Constraint), MLTAL); + } } static bool CheckConstraintSatisfaction( Sema &S, const NamedDecl *Template, ArrayRef<AssociatedConstraint> AssociatedConstraints, - llvm::SmallVectorImpl<Expr *> &Converted, const MultiLevelTemplateArgumentList &TemplateArgsLists, - SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) { + SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction, + Expr **ConvertedExpr, const ConceptReference *TopLevelConceptId = nullptr) { + + if (ConvertedExpr) + *ConvertedExpr = nullptr; + if (AssociatedConstraints.empty()) { Satisfaction.IsSatisfied = true; return false; @@ -578,57 +1069,60 @@ static bool CheckConstraintSatisfaction( return false; } - ArrayRef<TemplateArgument> TemplateArgs = - TemplateArgsLists.getNumSubstitutedLevels() > 0 - ? TemplateArgsLists.getOutermost() - : ArrayRef<TemplateArgument>{}; - Sema::InstantiatingTemplate Inst(S, TemplateIDRange.getBegin(), - Sema::InstantiatingTemplate::ConstraintsCheck{}, - const_cast<NamedDecl *>(Template), TemplateArgs, TemplateIDRange); - if (Inst.isInvalid()) + llvm::ArrayRef<TemplateArgument> Args; + if (TemplateArgsLists.getNumLevels() != 0) + Args = TemplateArgsLists.getInnermost(); + + std::optional<Sema::InstantiatingTemplate> SynthesisContext; + if (!TopLevelConceptId) { + SynthesisContext.emplace(S, TemplateIDRange.getBegin(), + Sema::InstantiatingTemplate::ConstraintsCheck{}, + const_cast<NamedDecl *>(Template), Args, + TemplateIDRange); + } + + const NormalizedConstraint *C = + S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints); + if (!C) { + Satisfaction.IsSatisfied = false; return true; + } - for (const AssociatedConstraint &AC : AssociatedConstraints) { - if (AC.isNull()) - return true; + if (TopLevelConceptId) + C = ConceptIdConstraint::Create(S.getASTContext(), TopLevelConceptId, + const_cast<NormalizedConstraint *>(C), + Template, /*CSE=*/nullptr, + S.ArgPackSubstIndex); - Sema::ArgPackSubstIndexRAII _(S, AC.ArgPackSubstIndex); - ExprResult Res = calculateConstraintSatisfaction( - S, Template, TemplateIDRange.getBegin(), TemplateArgsLists, - AC.ConstraintExpr, Satisfaction); - if (Res.isInvalid()) - return true; + ExprResult Res = + ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(), + S.ArgPackSubstIndex, Satisfaction) + .Evaluate(*C, TemplateArgsLists); + + if (Res.isInvalid()) + return true; + + if (Res.isUsable() && ConvertedExpr) + *ConvertedExpr = Res.get(); - Converted.push_back(Res.get()); - if (!Satisfaction.IsSatisfied) { - // Backfill the 'converted' list with nulls so we can keep the Converted - // and unconverted lists in sync. 
- Converted.append(AssociatedConstraints.size() - Converted.size(), - nullptr); - // [temp.constr.op] p2 - // [...] To determine if a conjunction is satisfied, the satisfaction - // of the first operand is checked. If that is not satisfied, the - // conjunction is not satisfied. [...] - return false; - } - } return false; } bool Sema::CheckConstraintSatisfaction( - const NamedDecl *Template, + ConstrainedDeclOrNestedRequirement Entity, ArrayRef<AssociatedConstraint> AssociatedConstraints, - llvm::SmallVectorImpl<Expr *> &ConvertedConstraints, const MultiLevelTemplateArgumentList &TemplateArgsLists, - SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction) { + SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction, + const ConceptReference *TopLevelConceptId, Expr **ConvertedExpr) { if (AssociatedConstraints.empty()) { OutSatisfaction.IsSatisfied = true; return false; } + const auto *Template = Entity.dyn_cast<const NamedDecl *>(); if (!Template) { return ::CheckConstraintSatisfaction( - *this, nullptr, AssociatedConstraints, ConvertedConstraints, - TemplateArgsLists, TemplateIDRange, OutSatisfaction); + *this, nullptr, AssociatedConstraints, TemplateArgsLists, + TemplateIDRange, OutSatisfaction, ConvertedExpr, TopLevelConceptId); } // Invalid templates could make their way here. Substituting them could result // in dependent expressions. @@ -643,10 +1137,15 @@ bool Sema::CheckConstraintSatisfaction( // here. llvm::SmallVector<TemplateArgument, 4> FlattenedArgs; for (auto List : TemplateArgsLists) - llvm::append_range(FlattenedArgs, List.Args); + for (const TemplateArgument &Arg : List.Args) + FlattenedArgs.emplace_back(Context.getCanonicalTemplateArgument(Arg)); + + const NamedDecl *Owner = Template; + if (TopLevelConceptId) + Owner = TopLevelConceptId->getNamedConcept(); llvm::FoldingSetNodeID ID; - ConstraintSatisfaction::Profile(ID, Context, Template, FlattenedArgs); + ConstraintSatisfaction::Profile(ID, Context, Owner, FlattenedArgs); void *InsertPos; if (auto *Cached = SatisfactionCache.FindNodeOrInsertPos(ID, InsertPos)) { OutSatisfaction = *Cached; @@ -654,11 +1153,11 @@ bool Sema::CheckConstraintSatisfaction( } auto Satisfaction = - std::make_unique<ConstraintSatisfaction>(Template, FlattenedArgs); - if (::CheckConstraintSatisfaction(*this, Template, AssociatedConstraints, - ConvertedConstraints, TemplateArgsLists, - TemplateIDRange, *Satisfaction)) { - OutSatisfaction = *Satisfaction; + std::make_unique<ConstraintSatisfaction>(Owner, FlattenedArgs); + if (::CheckConstraintSatisfaction( + *this, Template, AssociatedConstraints, TemplateArgsLists, + TemplateIDRange, *Satisfaction, ConvertedExpr, TopLevelConceptId)) { + OutSatisfaction = std::move(*Satisfaction); return true; } @@ -688,14 +1187,18 @@ bool Sema::CheckConstraintSatisfaction( const ConceptSpecializationExpr *ConstraintExpr, ConstraintSatisfaction &Satisfaction) { + llvm::SmallVector<AssociatedConstraint, 1> Constraints; + Constraints.emplace_back( + ConstraintExpr->getNamedConcept()->getConstraintExpr()); + MultiLevelTemplateArgumentList MLTAL(ConstraintExpr->getNamedConcept(), ConstraintExpr->getTemplateArguments(), true); - return calculateConstraintSatisfaction( - *this, ConstraintExpr, ConstraintExpr->getNamedConcept(), - ConstraintExpr->getConceptNameLoc(), MLTAL, Satisfaction) - .isInvalid(); + return CheckConstraintSatisfaction( + ConstraintExpr->getNamedConcept(), Constraints, MLTAL, + ConstraintExpr->getSourceRange(), Satisfaction, + ConstraintExpr->getConceptReference()); } bool 
Sema::SetupConstraintScope( @@ -854,50 +1357,6 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD, Satisfaction); } - -// Figure out the to-translation-unit depth for this function declaration for -// the purpose of seeing if they differ by constraints. This isn't the same as -// getTemplateDepth, because it includes already instantiated parents. -static unsigned -CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND, - bool SkipForSpecialization = false) { - MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - ND, ND->getLexicalDeclContext(), /*Final=*/false, - /*Innermost=*/std::nullopt, - /*RelativeToPrimary=*/true, - /*Pattern=*/nullptr, - /*ForConstraintInstantiation=*/true, SkipForSpecialization); - return MLTAL.getNumLevels(); -} - -namespace { - class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> { - unsigned TemplateDepth = 0; - public: - using inherited = TreeTransform<AdjustConstraintDepth>; - AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth) - : inherited(SemaRef), TemplateDepth(TemplateDepth) {} - - using inherited::TransformTemplateTypeParmType; - QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, - TemplateTypeParmTypeLoc TL, bool) { - const TemplateTypeParmType *T = TL.getTypePtr(); - - TemplateTypeParmDecl *NewTTPDecl = nullptr; - if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl()) - NewTTPDecl = cast_or_null<TemplateTypeParmDecl>( - TransformDecl(TL.getNameLoc(), OldTTPDecl)); - - QualType Result = getSema().Context.getTemplateTypeParmType( - T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(), - NewTTPDecl); - TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result); - NewTL.setNameLoc(TL.getNameLoc()); - return Result; - } - }; -} // namespace - static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( Sema &S, const Sema::TemplateCompareNewDeclInfo &DeclInfo, const Expr *ConstrExpr) { @@ -1161,73 +1620,61 @@ bool Sema::CheckFunctionTemplateConstraints( static void diagnoseUnsatisfiedRequirement(Sema &S, concepts::ExprRequirement *Req, bool First) { - assert(!Req->isSatisfied() - && "Diagnose() can only be used on an unsatisfied requirement"); + assert(!Req->isSatisfied() && + "Diagnose() can only be used on an unsatisfied requirement"); switch (Req->getSatisfactionStatus()) { - case concepts::ExprRequirement::SS_Dependent: - llvm_unreachable("Diagnosing a dependent requirement"); - break; - case concepts::ExprRequirement::SS_ExprSubstitutionFailure: { - auto *SubstDiag = Req->getExprSubstitutionDiagnostic(); - if (!SubstDiag->DiagMessage.empty()) - S.Diag(SubstDiag->DiagLoc, - diag::note_expr_requirement_expr_substitution_error) - << (int)First << SubstDiag->SubstitutedEntity - << SubstDiag->DiagMessage; - else - S.Diag(SubstDiag->DiagLoc, - diag::note_expr_requirement_expr_unknown_substitution_error) - << (int)First << SubstDiag->SubstitutedEntity; - break; - } - case concepts::ExprRequirement::SS_NoexceptNotMet: - S.Diag(Req->getNoexceptLoc(), - diag::note_expr_requirement_noexcept_not_met) - << (int)First << Req->getExpr(); - break; - case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: { - auto *SubstDiag = - Req->getReturnTypeRequirement().getSubstitutionDiagnostic(); - if (!SubstDiag->DiagMessage.empty()) - S.Diag(SubstDiag->DiagLoc, - diag::note_expr_requirement_type_requirement_substitution_error) - << (int)First << SubstDiag->SubstitutedEntity - << SubstDiag->DiagMessage; - else - S.Diag(SubstDiag->DiagLoc, - 
diag::note_expr_requirement_type_requirement_unknown_substitution_error) - << (int)First << SubstDiag->SubstitutedEntity; - break; - } - case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: { - ConceptSpecializationExpr *ConstraintExpr = - Req->getReturnTypeRequirementSubstitutedConstraintExpr(); - if (ConstraintExpr->getTemplateArgsAsWritten()->NumTemplateArgs == 1) { - // A simple case - expr type is the type being constrained and the concept - // was not provided arguments. - Expr *e = Req->getExpr(); - S.Diag(e->getBeginLoc(), - diag::note_expr_requirement_constraints_not_satisfied_simple) - << (int)First << S.Context.getReferenceQualifiedType(e) - << ConstraintExpr->getNamedConcept(); - } else { - S.Diag(ConstraintExpr->getBeginLoc(), - diag::note_expr_requirement_constraints_not_satisfied) - << (int)First << ConstraintExpr; - } - S.DiagnoseUnsatisfiedConstraint(ConstraintExpr->getSatisfaction()); - break; - } - case concepts::ExprRequirement::SS_Satisfied: - llvm_unreachable("We checked this above"); + case concepts::ExprRequirement::SS_Dependent: + llvm_unreachable("Diagnosing a dependent requirement"); + break; + case concepts::ExprRequirement::SS_ExprSubstitutionFailure: { + auto *SubstDiag = Req->getExprSubstitutionDiagnostic(); + if (!SubstDiag->DiagMessage.empty()) + S.Diag(SubstDiag->DiagLoc, + diag::note_expr_requirement_expr_substitution_error) + << (int)First << SubstDiag->SubstitutedEntity + << SubstDiag->DiagMessage; + else + S.Diag(SubstDiag->DiagLoc, + diag::note_expr_requirement_expr_unknown_substitution_error) + << (int)First << SubstDiag->SubstitutedEntity; + break; + } + case concepts::ExprRequirement::SS_NoexceptNotMet: + S.Diag(Req->getNoexceptLoc(), diag::note_expr_requirement_noexcept_not_met) + << (int)First << Req->getExpr(); + break; + case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: { + auto *SubstDiag = + Req->getReturnTypeRequirement().getSubstitutionDiagnostic(); + if (!SubstDiag->DiagMessage.empty()) + S.Diag(SubstDiag->DiagLoc, + diag::note_expr_requirement_type_requirement_substitution_error) + << (int)First << SubstDiag->SubstitutedEntity + << SubstDiag->DiagMessage; + else + S.Diag( + SubstDiag->DiagLoc, + diag:: + note_expr_requirement_type_requirement_unknown_substitution_error) + << (int)First << SubstDiag->SubstitutedEntity; + break; + } + case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: { + ConceptSpecializationExpr *ConstraintExpr = + Req->getReturnTypeRequirementSubstitutedConstraintExpr(); + S.DiagnoseUnsatisfiedConstraint(ConstraintExpr); + break; + } + case concepts::ExprRequirement::SS_Satisfied: + llvm_unreachable("We checked this above"); } } static void diagnoseUnsatisfiedRequirement(Sema &S, concepts::TypeRequirement *Req, bool First) { - assert(!Req->isSatisfied() - && "Diagnose() can only be used on an unsatisfied requirement"); + assert(!Req->isSatisfied() && + "Diagnose() can only be used on an unsatisfied requirement"); switch (Req->getSatisfactionStatus()) { case concepts::TypeRequirement::SS_Dependent: llvm_unreachable("Diagnosing a dependent requirement"); @@ -1235,9 +1682,9 @@ static void diagnoseUnsatisfiedRequirement(Sema &S, case concepts::TypeRequirement::SS_SubstitutionFailure: { auto *SubstDiag = Req->getSubstitutionDiagnostic(); if (!SubstDiag->DiagMessage.empty()) - S.Diag(SubstDiag->DiagLoc, - diag::note_type_requirement_substitution_error) << (int)First - << SubstDiag->SubstitutedEntity << SubstDiag->DiagMessage; + S.Diag(SubstDiag->DiagLoc, 
diag::note_type_requirement_substitution_error) + << (int)First << SubstDiag->SubstitutedEntity + << SubstDiag->DiagMessage; else S.Diag(SubstDiag->DiagLoc, diag::note_type_requirement_unknown_substitution_error) @@ -1249,31 +1696,53 @@ static void diagnoseUnsatisfiedRequirement(Sema &S, return; } } -static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, - Expr *SubstExpr, - bool First = true); + +static void diagnoseUnsatisfiedConceptIdExpr(Sema &S, + const ConceptReference *Concept, + SourceLocation Loc, bool First) { + if (Concept->getTemplateArgsAsWritten()->NumTemplateArgs == 1) { + S.Diag( + Loc, + diag:: + note_single_arg_concept_specialization_constraint_evaluated_to_false) + << (int)First + << Concept->getTemplateArgsAsWritten()->arguments()[0].getArgument() + << Concept->getNamedConcept(); + } else { + S.Diag(Loc, diag::note_concept_specialization_constraint_evaluated_to_false) + << (int)First << Concept; + } +} + +static void diagnoseUnsatisfiedConstraintExpr( + Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc, + bool First, concepts::NestedRequirement *Req = nullptr); + +static void DiagnoseUnsatisfiedConstraint( + Sema &S, ArrayRef<UnsatisfiedConstraintRecord> Records, SourceLocation Loc, + bool First = true, concepts::NestedRequirement *Req = nullptr) { + for (auto &Record : Records) { + diagnoseUnsatisfiedConstraintExpr(S, Record, Loc, First, Req); + Loc = {}; + First = isa<const ConceptReference *>(Record); + } +} static void diagnoseUnsatisfiedRequirement(Sema &S, concepts::NestedRequirement *Req, bool First) { - using SubstitutionDiagnostic = std::pair<SourceLocation, StringRef>; - for (auto &Record : Req->getConstraintSatisfaction()) { - if (auto *SubstDiag = Record.dyn_cast<SubstitutionDiagnostic *>()) - S.Diag(SubstDiag->first, diag::note_nested_requirement_substitution_error) - << (int)First << Req->getInvalidConstraintEntity() - << SubstDiag->second; - else - diagnoseWellFormedUnsatisfiedConstraintExpr(S, Record.dyn_cast<Expr *>(), - First); - First = false; - } + DiagnoseUnsatisfiedConstraint(S, Req->getConstraintSatisfaction().records(), + Req->hasInvalidConstraint() + ? SourceLocation() + : Req->getConstraintExpr()->getExprLoc(), + First, Req); } static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, - Expr *SubstExpr, + const Expr *SubstExpr, bool First) { SubstExpr = SubstExpr->IgnoreParenImpCasts(); - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) { + if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) { switch (BO->getOpcode()) { // These two cases will in practice only be reached when using fold // expressions with || and &&, since otherwise the || and && will have been @@ -1319,7 +1788,7 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, BO->getRHS()->EvaluateAsInt(SimplifiedRHS, S.Context, Expr::SE_NoSideEffects, /*InConstantContext=*/true); - if (!SimplifiedLHS.Diag && ! 
SimplifiedRHS.Diag) { + if (!SimplifiedLHS.Diag && !SimplifiedRHS.Diag) { S.Diag(SubstExpr->getBeginLoc(), diag::note_atomic_constraint_evaluated_to_false_elaborated) << (int)First << SubstExpr @@ -1334,22 +1803,6 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, default: break; } - } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) { - if (CSE->getTemplateArgsAsWritten()->NumTemplateArgs == 1) { - S.Diag( - CSE->getSourceRange().getBegin(), - diag:: - note_single_arg_concept_specialization_constraint_evaluated_to_false) - << (int)First - << CSE->getTemplateArgsAsWritten()->arguments()[0].getArgument() - << CSE->getNamedConcept(); - } else { - S.Diag(SubstExpr->getSourceRange().getBegin(), - diag::note_concept_specialization_constraint_evaluated_to_false) - << (int)First << CSE; - } - S.DiagnoseUnsatisfiedConstraint(CSE->getSatisfaction()); - return; } else if (auto *RE = dyn_cast<RequiresExpr>(SubstExpr)) { // FIXME: RequiresExpr should store dependent diagnostics. for (concepts::Requirement *Req : RE->getRequirements()) @@ -1364,6 +1817,10 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, break; } return; + } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) { + // Drill down concept ids treated as atomic constraints + S.DiagnoseUnsatisfiedConstraint(CSE, First); + return; } else if (auto *TTE = dyn_cast<TypeTraitExpr>(SubstExpr); TTE && TTE->getTrait() == clang::TypeTrait::BTT_IsDeducible) { assert(TTE->getNumArgs() == 2); @@ -1379,216 +1836,332 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S, S.DiagnoseTypeTraitDetails(SubstExpr); } -template <typename SubstitutionDiagnostic> static void diagnoseUnsatisfiedConstraintExpr( - Sema &S, const llvm::PointerUnion<Expr *, SubstitutionDiagnostic *> &Record, - bool First = true) { - if (auto *Diag = Record.template dyn_cast<SubstitutionDiagnostic *>()) { - S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed) - << Diag->second; + Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc, + bool First, concepts::NestedRequirement *Req) { + if (auto *Diag = + Record + .template dyn_cast<const ConstraintSubstitutionDiagnostic *>()) { + if (Req) + S.Diag(Diag->first, diag::note_nested_requirement_substitution_error) + << (int)First << Req->getInvalidConstraintEntity() << Diag->second; + else + S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed) + << Diag->second; return; } - - diagnoseWellFormedUnsatisfiedConstraintExpr(S, cast<Expr *>(Record), First); + if (const auto *Concept = dyn_cast<const ConceptReference *>(Record)) { + if (Loc.isInvalid()) + Loc = Concept->getBeginLoc(); + diagnoseUnsatisfiedConceptIdExpr(S, Concept, Loc, First); + return; + } + diagnoseWellFormedUnsatisfiedConstraintExpr( + S, cast<const class Expr *>(Record), First); } -void -Sema::DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction& Satisfaction, - bool First) { +void Sema::DiagnoseUnsatisfiedConstraint( + const ConstraintSatisfaction &Satisfaction, SourceLocation Loc, + bool First) { + assert(!Satisfaction.IsSatisfied && "Attempted to diagnose a satisfied constraint"); - for (auto &Record : Satisfaction.Details) { - diagnoseUnsatisfiedConstraintExpr(*this, Record, First); - First = false; - } + ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.Details, Loc, First); } void Sema::DiagnoseUnsatisfiedConstraint( - const ASTConstraintSatisfaction &Satisfaction, - bool First) { + const ConceptSpecializationExpr 
*ConstraintExpr, bool First) { + + const ASTConstraintSatisfaction &Satisfaction = + ConstraintExpr->getSatisfaction(); + assert(!Satisfaction.IsSatisfied && "Attempted to diagnose a satisfied constraint"); - for (auto &Record : Satisfaction) { - diagnoseUnsatisfiedConstraintExpr(*this, Record, First); - First = false; - } + + ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.records(), + ConstraintExpr->getBeginLoc(), First); } -const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( - const NamedDecl *ConstrainedDecl, - ArrayRef<AssociatedConstraint> AssociatedConstraints) { - // In case the ConstrainedDecl comes from modules, it is necessary to use - // the canonical decl to avoid different atomic constraints with the 'same' - // declarations. - ConstrainedDecl = cast<NamedDecl>(ConstrainedDecl->getCanonicalDecl()); +namespace { - auto CacheEntry = NormalizationCache.find(ConstrainedDecl); - if (CacheEntry == NormalizationCache.end()) { - auto Normalized = NormalizedConstraint::fromAssociatedConstraints( - *this, ConstrainedDecl, AssociatedConstraints); - CacheEntry = - NormalizationCache - .try_emplace(ConstrainedDecl, - Normalized - ? new (Context) NormalizedConstraint( - std::move(*Normalized)) - : nullptr) - .first; - } - return CacheEntry->second; -} +class SubstituteParameterMappings { + Sema &SemaRef; -const NormalizedConstraint *clang::getNormalizedAssociatedConstraints( - Sema &S, const NamedDecl *ConstrainedDecl, - ArrayRef<AssociatedConstraint> AssociatedConstraints) { - return S.getNormalizedAssociatedConstraints(ConstrainedDecl, - AssociatedConstraints); -} + const MultiLevelTemplateArgumentList *MLTAL; + const ASTTemplateArgumentListInfo *ArgsAsWritten; -static bool -substituteParameterMappings(Sema &S, NormalizedConstraint &N, - ConceptDecl *Concept, - const MultiLevelTemplateArgumentList &MLTAL, - const ASTTemplateArgumentListInfo *ArgsAsWritten) { + bool InFoldExpr; - if (N.isCompound()) { - if (substituteParameterMappings(S, N.getLHS(), Concept, MLTAL, - ArgsAsWritten)) - return true; - return substituteParameterMappings(S, N.getRHS(), Concept, MLTAL, - ArgsAsWritten); - } + SubstituteParameterMappings(Sema &SemaRef, + const MultiLevelTemplateArgumentList *MLTAL, + const ASTTemplateArgumentListInfo *ArgsAsWritten, + bool InFoldExpr) + : SemaRef(SemaRef), MLTAL(MLTAL), ArgsAsWritten(ArgsAsWritten), + InFoldExpr(InFoldExpr) {} + + void buildParameterMapping(NormalizedConstraintWithParamMapping &N); + + bool substitute(NormalizedConstraintWithParamMapping &N); + + bool substitute(ConceptIdConstraint &CC); + +public: + SubstituteParameterMappings(Sema &SemaRef, bool InFoldExpr = false) + : SemaRef(SemaRef), MLTAL(nullptr), ArgsAsWritten(nullptr), + InFoldExpr(InFoldExpr) {} + + bool substitute(NormalizedConstraint &N); +}; - if (N.isFoldExpanded()) { - Sema::ArgPackSubstIndexRAII _(S, std::nullopt); - return substituteParameterMappings( - S, N.getFoldExpandedConstraint()->Constraint, Concept, MLTAL, - ArgsAsWritten); +void SubstituteParameterMappings::buildParameterMapping( + NormalizedConstraintWithParamMapping &N) { + TemplateParameterList *TemplateParams = + cast<TemplateDecl>(N.getConstraintDecl())->getTemplateParameters(); + + llvm::SmallBitVector OccurringIndices(TemplateParams->size()); + llvm::SmallBitVector OccurringIndicesForSubsumption(TemplateParams->size()); + + if (N.getKind() == NormalizedConstraint::ConstraintKind::Atomic) { + SemaRef.MarkUsedTemplateParameters( + static_cast<AtomicConstraint &>(N).getConstraintExpr(), + 
/*OnlyDeduced=*/false, + /*Depth=*/0, OccurringIndices); + + SemaRef.MarkUsedTemplateParametersForSubsumptionParameterMapping( + static_cast<AtomicConstraint &>(N).getConstraintExpr(), + /*Depth=*/0, OccurringIndicesForSubsumption); + + } else if (N.getKind() == + NormalizedConstraint::ConstraintKind::FoldExpanded) { + SemaRef.MarkUsedTemplateParameters( + static_cast<FoldExpandedConstraint &>(N).getPattern(), + /*OnlyDeduced=*/false, + /*Depth=*/0, OccurringIndices); + } else if (N.getKind() == NormalizedConstraint::ConstraintKind::ConceptId) { + auto *Args = static_cast<ConceptIdConstraint &>(N) + .getConceptId() + ->getTemplateArgsAsWritten(); + if (Args) + SemaRef.MarkUsedTemplateParameters(Args->arguments(), + /*Depth=*/0, OccurringIndices); } + TemplateArgumentLoc *TempArgs = + new (SemaRef.Context) TemplateArgumentLoc[OccurringIndices.count()]; + llvm::SmallVector<NamedDecl *> UsedParams; + for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) { + SourceLocation Loc = ArgsAsWritten->NumTemplateArgs > I + ? ArgsAsWritten->arguments()[I].getLocation() + : SourceLocation(); + // FIXME: Investigate why we couldn't always preserve the SourceLoc. We + // can't assert Loc.isValid() now. + if (OccurringIndices[I]) { + NamedDecl *Param = TemplateParams->begin()[I]; + new (&(TempArgs)[J]) TemplateArgumentLoc( + SemaRef.getIdentityTemplateArgumentLoc(Param, Loc)); + UsedParams.push_back(Param); + J++; + } + } + auto *UsedList = TemplateParameterList::Create( + SemaRef.Context, TemplateParams->getTemplateLoc(), + TemplateParams->getLAngleLoc(), UsedParams, + /*RAngleLoc=*/SourceLocation(), + /*RequiresClause=*/nullptr); + unsigned Size = OccurringIndices.count(); + N.updateParameterMapping( + std::move(OccurringIndices), std::move(OccurringIndicesForSubsumption), + MutableArrayRef<TemplateArgumentLoc>{TempArgs, Size}, UsedList); +} - TemplateParameterList *TemplateParams = Concept->getTemplateParameters(); +bool SubstituteParameterMappings::substitute( + NormalizedConstraintWithParamMapping &N) { + if (!N.hasParameterMapping()) + buildParameterMapping(N); - AtomicConstraint &Atomic = *N.getAtomicConstraint(); - TemplateArgumentListInfo SubstArgs; - if (!Atomic.ParameterMapping) { - llvm::SmallBitVector OccurringIndices(TemplateParams->size()); - S.MarkUsedTemplateParameters(Atomic.ConstraintExpr, /*OnlyDeduced=*/false, - /*Depth=*/0, OccurringIndices); - TemplateArgumentLoc *TempArgs = - new (S.Context) TemplateArgumentLoc[OccurringIndices.count()]; - for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) - if (OccurringIndices[I]) - new (&(TempArgs)[J++]) - TemplateArgumentLoc(S.getIdentityTemplateArgumentLoc( - TemplateParams->begin()[I], - // Here we assume we do not support things like - // template<typename A, typename B> - // concept C = ...; - // - // template<typename... Ts> requires C<Ts...> - // struct S { }; - // The above currently yields a diagnostic. - // We still might have default arguments for concept parameters. - ArgsAsWritten->NumTemplateArgs > I - ? ArgsAsWritten->arguments()[I].getLocation() - : SourceLocation())); - Atomic.ParameterMapping.emplace(TempArgs, OccurringIndices.count()); - } - SourceLocation InstLocBegin = - ArgsAsWritten->arguments().empty() - ? ArgsAsWritten->getLAngleLoc() - : ArgsAsWritten->arguments().front().getSourceRange().getBegin(); - SourceLocation InstLocEnd = - ArgsAsWritten->arguments().empty() - ? 
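A hedged source-level sketch of what the occurrence bitvectors built above correspond to (EitherIntPtrOrLong, f, T, U are invented names): each atomic piece of the concept body mentions only one of the concept's parameters, so the parameter mapping recorded for that piece needs only the matching written argument.

    #include <type_traits>

    template <class A, class B>
    concept EitherIntPtrOrLong =
        std::is_same_v<A, int *> || std::is_same_v<B, long>;

    template <class T, class U>
      requires EitherIntPtrOrLong<T *, U>   // one atom maps only A -> T*, the other only B -> U
    void f(T, U);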
ArgsAsWritten->getRAngleLoc() - : ArgsAsWritten->arguments().front().getSourceRange().getEnd(); + SourceLocation InstLocBegin, InstLocEnd; + llvm::ArrayRef Arguments = ArgsAsWritten->arguments(); + if (Arguments.empty()) { + InstLocBegin = ArgsAsWritten->getLAngleLoc(); + InstLocEnd = ArgsAsWritten->getRAngleLoc(); + } else { + auto SR = Arguments[0].getSourceRange(); + InstLocBegin = SR.getBegin(); + InstLocEnd = SR.getEnd(); + } Sema::InstantiatingTemplate Inst( - S, InstLocBegin, + SemaRef, InstLocBegin, Sema::InstantiatingTemplate::ParameterMappingSubstitution{}, - const_cast<NamedDecl *>(Atomic.ConstraintDecl), + const_cast<NamedDecl *>(N.getConstraintDecl()), {InstLocBegin, InstLocEnd}); if (Inst.isInvalid()) return true; - if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs)) + + // TransformTemplateArguments is unable to preserve the source location of a + // pack. The SourceLocation is necessary for the instantiation location. + // FIXME: The BaseLoc will be used as the location of the pack expansion, + // which is wrong. + TemplateArgumentListInfo SubstArgs; + if (SemaRef.SubstTemplateArgumentsInParameterMapping( + N.getParameterMapping(), N.getBeginLoc(), *MLTAL, SubstArgs, + /*BuildPackExpansionTypes=*/!InFoldExpr)) + return true; + Sema::CheckTemplateArgumentInfo CTAI; + auto *TD = + const_cast<TemplateDecl *>(cast<TemplateDecl>(N.getConstraintDecl())); + if (SemaRef.CheckTemplateArgumentList(TD, N.getUsedTemplateParamList(), + TD->getLocation(), SubstArgs, + /*DefaultArguments=*/{}, + /*PartialTemplateArgs=*/false, CTAI)) return true; TemplateArgumentLoc *TempArgs = - new (S.Context) TemplateArgumentLoc[SubstArgs.size()]; - std::copy(SubstArgs.arguments().begin(), SubstArgs.arguments().end(), - TempArgs); - Atomic.ParameterMapping.emplace(TempArgs, SubstArgs.size()); + new (SemaRef.Context) TemplateArgumentLoc[CTAI.SugaredConverted.size()]; + + for (unsigned I = 0; I < CTAI.SugaredConverted.size(); ++I) { + SourceLocation Loc; + // If this is an empty pack, we have no corresponding SubstArgs. 
+ if (I < SubstArgs.size()) + Loc = SubstArgs.arguments()[I].getLocation(); + + TempArgs[I] = SemaRef.getTrivialTemplateArgumentLoc( + CTAI.SugaredConverted[I], QualType(), Loc); + } + + MutableArrayRef<TemplateArgumentLoc> Mapping(TempArgs, + CTAI.SugaredConverted.size()); + N.updateParameterMapping(N.mappingOccurenceList(), + N.mappingOccurenceListForSubsumption(), Mapping, + N.getUsedTemplateParamList()); return false; } -static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, - const ConceptSpecializationExpr *CSE) { - MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(), - /*Final=*/false, CSE->getTemplateArguments(), - /*RelativeToPrimary=*/true, - /*Pattern=*/nullptr, - /*ForConstraintInstantiation=*/true); +bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) { + assert(CC.getConstraintDecl() && MLTAL && ArgsAsWritten); - return substituteParameterMappings(S, N, CSE->getNamedConcept(), MLTAL, - CSE->getTemplateArgsAsWritten()); -} + if (substitute(static_cast<NormalizedConstraintWithParamMapping &>(CC))) + return true; -NormalizedConstraint::NormalizedConstraint(ASTContext &C, - NormalizedConstraint LHS, - NormalizedConstraint RHS, - CompoundConstraintKind Kind) - : Constraint{CompoundConstraint{ - new(C) NormalizedConstraintPair{std::move(LHS), std::move(RHS)}, - Kind}} {} - -NormalizedConstraint::NormalizedConstraint(ASTContext &C, - const NormalizedConstraint &Other) { - if (Other.isAtomic()) { - Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint()); - } else if (Other.isFoldExpanded()) { - Constraint = new (C) FoldExpandedConstraint( - Other.getFoldExpandedConstraint()->Kind, - NormalizedConstraint(C, Other.getFoldExpandedConstraint()->Constraint), - Other.getFoldExpandedConstraint()->Pattern); + auto *CSE = CC.getConceptSpecializationExpr(); + assert(CSE); + assert(!CC.getBeginLoc().isInvalid()); + + SourceLocation InstLocBegin, InstLocEnd; + if (llvm::ArrayRef Arguments = ArgsAsWritten->arguments(); + Arguments.empty()) { + InstLocBegin = ArgsAsWritten->getLAngleLoc(); + InstLocEnd = ArgsAsWritten->getRAngleLoc(); } else { - Constraint = CompoundConstraint( - new (C) - NormalizedConstraintPair{NormalizedConstraint(C, Other.getLHS()), - NormalizedConstraint(C, Other.getRHS())}, - Other.getCompoundKind()); + auto SR = Arguments[0].getSourceRange(); + InstLocBegin = SR.getBegin(); + InstLocEnd = SR.getEnd(); } -} + // This is useful for name lookup across modules; see Sema::getLookupModules. + Sema::InstantiatingTemplate Inst( + SemaRef, InstLocBegin, + Sema::InstantiatingTemplate::ParameterMappingSubstitution{}, + const_cast<NamedDecl *>(CC.getConstraintDecl()), + {InstLocBegin, InstLocEnd}); + if (Inst.isInvalid()) + return true; -NormalizedConstraint &NormalizedConstraint::getLHS() const { - assert(isCompound() && "getLHS called on a non-compound constraint."); - return cast<CompoundConstraint>(Constraint).getPointer()->LHS; + TemplateArgumentListInfo Out; + // TransformTemplateArguments is unable to preserve the source location of a + // pack. The SourceLocation is necessary for the instantiation location. + // FIXME: The BaseLoc will be used as the location of the pack expansion, + // which is wrong. 
+ const ASTTemplateArgumentListInfo *ArgsAsWritten = + CSE->getTemplateArgsAsWritten(); + if (SemaRef.SubstTemplateArgumentsInParameterMapping( + ArgsAsWritten->arguments(), CC.getBeginLoc(), *MLTAL, Out, + /*BuildPackExpansionTypes=*/!InFoldExpr)) + return true; + Sema::CheckTemplateArgumentInfo CTAI; + if (SemaRef.CheckTemplateArgumentList(CSE->getNamedConcept(), + CSE->getConceptNameInfo().getLoc(), Out, + /*DefaultArgs=*/{}, + /*PartialTemplateArgs=*/false, CTAI, + /*UpdateArgsWithConversions=*/false)) + return true; + auto TemplateArgs = *MLTAL; + TemplateArgs.replaceOutermostTemplateArguments( + TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted); + return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten, + InFoldExpr) + .substitute(CC.getNormalizedConstraint()); } -NormalizedConstraint &NormalizedConstraint::getRHS() const { - assert(isCompound() && "getRHS called on a non-compound constraint."); - return cast<CompoundConstraint>(Constraint).getPointer()->RHS; +bool SubstituteParameterMappings::substitute(NormalizedConstraint &N) { + switch (N.getKind()) { + case NormalizedConstraint::ConstraintKind::Atomic: { + if (!MLTAL) { + assert(!ArgsAsWritten); + return false; + } + return substitute(static_cast<NormalizedConstraintWithParamMapping &>(N)); + } + case NormalizedConstraint::ConstraintKind::FoldExpanded: { + auto &FE = static_cast<FoldExpandedConstraint &>(N); + if (!MLTAL) { + llvm::SaveAndRestore _1(InFoldExpr, true); + assert(!ArgsAsWritten); + return substitute(FE.getNormalizedPattern()); + } + Sema::ArgPackSubstIndexRAII _(SemaRef, std::nullopt); + substitute(static_cast<NormalizedConstraintWithParamMapping &>(FE)); + return SubstituteParameterMappings(SemaRef, /*InFoldExpr=*/true) + .substitute(FE.getNormalizedPattern()); + } + case NormalizedConstraint::ConstraintKind::ConceptId: { + auto &CC = static_cast<ConceptIdConstraint &>(N); + if (MLTAL) { + assert(ArgsAsWritten); + return substitute(CC); + } + assert(!ArgsAsWritten); + const ConceptSpecializationExpr *CSE = CC.getConceptSpecializationExpr(); + ConceptDecl *Concept = CSE->getNamedConcept(); + MultiLevelTemplateArgumentList MLTAL = SemaRef.getTemplateInstantiationArgs( + Concept, Concept->getLexicalDeclContext(), + /*Final=*/true, CSE->getTemplateArguments(), + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); + + return SubstituteParameterMappings( + SemaRef, &MLTAL, CSE->getTemplateArgsAsWritten(), InFoldExpr) + .substitute(CC.getNormalizedConstraint()); + } + case NormalizedConstraint::ConstraintKind::Compound: { + auto &Compound = static_cast<CompoundConstraint &>(N); + if (substitute(Compound.getLHS())) + return true; + return substitute(Compound.getRHS()); + } + } } -std::optional<NormalizedConstraint> -NormalizedConstraint::fromAssociatedConstraints( +} // namespace + +NormalizedConstraint *NormalizedConstraint::fromAssociatedConstraints( Sema &S, const NamedDecl *D, ArrayRef<AssociatedConstraint> ACs) { assert(ACs.size() != 0); - auto Conjunction = fromConstraintExpr(S, D, ACs[0].ConstraintExpr); + auto *Conjunction = + fromConstraintExpr(S, D, ACs[0].ConstraintExpr, ACs[0].ArgPackSubstIndex); if (!Conjunction) - return std::nullopt; + return nullptr; for (unsigned I = 1; I < ACs.size(); ++I) { - auto Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr); + auto *Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr, + ACs[I].ArgPackSubstIndex); if (!Next) - return std::nullopt; - *Conjunction = NormalizedConstraint(S.Context, 
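As a rough source-level counterpart to the switch above (Hashable and g are invented names), a single requires-clause can produce every constraint kind it dispatches on: an atomic constraint, a concept-id constraint, a fold expanded constraint (given the C++26 handling in fromConstraintExpr further down), and the compound nodes joining them.

    template <class T>
    concept Hashable = requires(T t) { t == t; };

    template <class... Ts>
      requires (sizeof...(Ts) > 0)        // atomic
               && Hashable<int>           // concept-id
               && (Hashable<Ts> && ...)   // fold expanded
    void g(Ts...);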
std::move(*Conjunction), - std::move(*Next), CCK_Conjunction); + return nullptr; + Conjunction = CompoundConstraint::CreateConjunction(S.getASTContext(), + Conjunction, Next); } return Conjunction; } -std::optional<NormalizedConstraint> -NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, - const Expr *E) { +NormalizedConstraint *NormalizedConstraint::fromConstraintExpr( + Sema &S, const NamedDecl *D, const Expr *E, UnsignedOrNone SubstIndex) { assert(E != nullptr); // C++ [temp.constr.normal]p1.1 @@ -1597,23 +2170,29 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, // [...] E = E->IgnoreParenImpCasts(); + llvm::FoldingSetNodeID ID; + if (D && DiagRecursiveConstraintEval(S, ID, D, E)) { + return nullptr; + } + SatisfactionStackRAII StackRAII(S, D, ID); + // C++2a [temp.param]p4: // [...] If T is not a pack, then E is E', otherwise E is (E' && ...). // Fold expression is considered atomic constraints per current wording. // See http://cplusplus.github.io/concepts-ts/ts-active.html#28 if (LogicalBinOp BO = E) { - auto LHS = fromConstraintExpr(S, D, BO.getLHS()); + auto *LHS = fromConstraintExpr(S, D, BO.getLHS(), SubstIndex); if (!LHS) - return std::nullopt; - auto RHS = fromConstraintExpr(S, D, BO.getRHS()); + return nullptr; + auto *RHS = fromConstraintExpr(S, D, BO.getRHS(), SubstIndex); if (!RHS) - return std::nullopt; + return nullptr; - return NormalizedConstraint(S.Context, std::move(*LHS), std::move(*RHS), - BO.isAnd() ? CCK_Conjunction : CCK_Disjunction); + return CompoundConstraint::Create( + S.Context, LHS, BO.isAnd() ? CCK_Conjunction : CCK_Disjunction, RHS); } else if (auto *CSE = dyn_cast<const ConceptSpecializationExpr>(E)) { - const NormalizedConstraint *SubNF; + NormalizedConstraint *SubNF; { Sema::InstantiatingTemplate Inst( S, CSE->getExprLoc(), @@ -1621,7 +2200,7 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, // FIXME: improve const-correctness of InstantiatingTemplate const_cast<NamedDecl *>(D), CSE->getSourceRange()); if (Inst.isInvalid()) - return std::nullopt; + return nullptr; // C++ [temp.constr.normal]p1.1 // [...] // The normal form of an id-expression of the form C<A1, A2, ..., AN>, @@ -1631,20 +2210,21 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, // constraint. If any such substitution results in an invalid type or // expression, the program is ill-formed; no diagnostic is required. // [...] - ConceptDecl *CD = CSE->getNamedConcept(); - SubNF = S.getNormalizedAssociatedConstraints( - CD, AssociatedConstraint(CD->getConstraintExpr())); + + // Use canonical declarations to merge ConceptDecls across + // different modules. 
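The recursion check added above (DiagRecursiveConstraintEval plus SatisfactionStackRAII) guards normalization against constraints whose satisfaction depends on themselves. A plausible trigger, sketched with an invented function name and not taken from the patch, is a trailing requires-clause that names the very function it constrains, so checking it can recurse once the function is called.

    template <class T>
    auto twice(T t) requires requires { twice(t); } {   // satisfaction can depend on itself
      return t + t;
    }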
+ ConceptDecl *CD = CSE->getNamedConcept()->getCanonicalDecl(); + SubNF = NormalizedConstraint::fromAssociatedConstraints( + S, CD, AssociatedConstraint(CD->getConstraintExpr(), SubstIndex)); + if (!SubNF) - return std::nullopt; + return nullptr; } - std::optional<NormalizedConstraint> New; - New.emplace(S.Context, *SubNF); - - if (substituteParameterMappings(S, *New, CSE)) - return std::nullopt; + return ConceptIdConstraint::Create(S.getASTContext(), + CSE->getConceptReference(), SubNF, D, + CSE, SubstIndex); - return New; } else if (auto *FE = dyn_cast<const CXXFoldExpr>(E); FE && S.getLangOpts().CPlusPlus26 && (FE->getOperator() == BinaryOperatorKind::BO_LAnd || @@ -1658,31 +2238,61 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, : FoldExpandedConstraint::FoldOperatorKind::Or; if (FE->getInit()) { - auto LHS = fromConstraintExpr(S, D, FE->getLHS()); - auto RHS = fromConstraintExpr(S, D, FE->getRHS()); + auto *LHS = fromConstraintExpr(S, D, FE->getLHS(), SubstIndex); + auto *RHS = fromConstraintExpr(S, D, FE->getRHS(), SubstIndex); if (!LHS || !RHS) - return std::nullopt; + return nullptr; if (FE->isRightFold()) - RHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ - Kind, std::move(*RHS), FE->getPattern()}}; + LHS = FoldExpandedConstraint::Create(S.getASTContext(), + FE->getPattern(), D, Kind, LHS); else - LHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ - Kind, std::move(*LHS), FE->getPattern()}}; - - return NormalizedConstraint( - S.Context, std::move(*LHS), std::move(*RHS), - FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction - : CCK_Disjunction); + RHS = FoldExpandedConstraint::Create(S.getASTContext(), + FE->getPattern(), D, Kind, RHS); + + return CompoundConstraint::Create( + S.getASTContext(), LHS, + (FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction + : CCK_Disjunction), + RHS); } - auto Sub = fromConstraintExpr(S, D, FE->getPattern()); + auto *Sub = fromConstraintExpr(S, D, FE->getPattern(), SubstIndex); if (!Sub) - return std::nullopt; - return NormalizedConstraint{new (S.Context) FoldExpandedConstraint{ - Kind, std::move(*Sub), FE->getPattern()}}; + return nullptr; + return FoldExpandedConstraint::Create(S.getASTContext(), FE->getPattern(), + D, Kind, Sub); } + return AtomicConstraint::Create(S.getASTContext(), E, D, SubstIndex); +} - return NormalizedConstraint{new (S.Context) AtomicConstraint(E, D)}; +const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( + ConstrainedDeclOrNestedRequirement ConstrainedDeclOrNestedReq, + ArrayRef<AssociatedConstraint> AssociatedConstraints) { + if (!ConstrainedDeclOrNestedReq) { + auto *Normalized = NormalizedConstraint::fromAssociatedConstraints( + *this, nullptr, AssociatedConstraints); + if (!Normalized || + SubstituteParameterMappings(*this).substitute(*Normalized)) + return nullptr; + + return Normalized; + } + + // FIXME: ConstrainedDeclOrNestedReq is never a NestedRequirement! 
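The C++26 branch above splits a binary fold in a requires-clause into its init operand and its pattern, wrapping the pattern as a fold expanded constraint. A small sketch of the source form involved (Pred, S, and h are invented names):

    #include <type_traits>

    struct S {};

    template <class T>
    concept Pred = std::is_class_v<T>;

    template <class... Ts>
      requires (Pred<Ts> && ... && Pred<S>)   // pattern Pred<Ts>, init operand Pred<S>
    void h(Ts...);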
+ const NamedDecl *ND = + ConstrainedDeclOrNestedReq.dyn_cast<const NamedDecl *>(); + auto CacheEntry = NormalizationCache.find(ConstrainedDeclOrNestedReq); + if (CacheEntry == NormalizationCache.end()) { + auto *Normalized = NormalizedConstraint::fromAssociatedConstraints( + *this, ND, AssociatedConstraints); + CacheEntry = + NormalizationCache.try_emplace(ConstrainedDeclOrNestedReq, Normalized) + .first; + if (!Normalized || + SubstituteParameterMappings(*this).substitute(*Normalized)) + return nullptr; + } + return CacheEntry->second; } bool FoldExpandedConstraint::AreCompatibleForSubsumption( @@ -1693,8 +2303,10 @@ bool FoldExpandedConstraint::AreCompatibleForSubsumption( // if their respective constraints both contain an equivalent unexpanded pack. llvm::SmallVector<UnexpandedParameterPack> APacks, BPacks; - Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.Pattern), APacks); - Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.Pattern), BPacks); + Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.getPattern()), + APacks); + Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.getPattern()), + BPacks); for (const UnexpandedParameterPack &APack : APacks) { auto ADI = getDepthAndIndex(APack); @@ -1788,7 +2400,7 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic( const AtomicConstraint &B) { if (!A.hasMatchingParameterMapping(Context, B)) return false; - const Expr *EA = A.ConstraintExpr, *EB = B.ConstraintExpr; + const Expr *EA = A.getConstraintExpr(), *EB = B.getConstraintExpr(); if (EA == EB) return true; @@ -1841,24 +2453,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic( return true; } -NormalizedConstraint::CompoundConstraintKind -NormalizedConstraint::getCompoundKind() const { - assert(isCompound() && "getCompoundKind on a non-compound constraint.."); - return cast<CompoundConstraint>(Constraint).getInt(); -} - -AtomicConstraint *NormalizedConstraint::getAtomicConstraint() const { - assert(isAtomic() && "getAtomicConstraint called on non-atomic constraint."); - return cast<AtomicConstraint *>(Constraint); -} - -FoldExpandedConstraint * -NormalizedConstraint::getFoldExpandedConstraint() const { - assert(isFoldExpanded() && - "getFoldExpandedConstraint called on non-fold-expanded constraint."); - return cast<FoldExpandedConstraint *>(Constraint); -} - // // // ------------------------ Subsumption ----------------------------------- @@ -1874,8 +2468,8 @@ uint16_t SubsumptionChecker::getNewLiteralId() { return NextID++; } -auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal { - auto &Elems = AtomicMap[Ori->ConstraintExpr]; +auto SubsumptionChecker::find(const AtomicConstraint *Ori) -> Literal { + auto &Elems = AtomicMap[Ori->getConstraintExpr()]; // C++ [temp.constr.order] p2 // - an atomic constraint A subsumes another atomic constraint B // if and only if the A and B are identical [...] 
@@ -1891,13 +2485,16 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal { // subsumes another, their literal will be the same llvm::FoldingSetNodeID ID; - const auto &Mapping = Ori->ParameterMapping; - ID.AddBoolean(Mapping.has_value()); - if (Mapping) { - for (const TemplateArgumentLoc &TAL : *Mapping) { - SemaRef.getASTContext() - .getCanonicalTemplateArgument(TAL.getArgument()) - .Profile(ID, SemaRef.getASTContext()); + ID.AddBoolean(Ori->hasParameterMapping()); + if (Ori->hasParameterMapping()) { + const auto &Mapping = Ori->getParameterMapping(); + const NormalizedConstraint::OccurenceList &Indexes = + Ori->mappingOccurenceListForSubsumption(); + for (auto [Idx, TAL] : llvm::enumerate(Mapping)) { + if (Indexes[Idx]) + SemaRef.getASTContext() + .getCanonicalTemplateArgument(TAL.getArgument()) + .Profile(ID, SemaRef.getASTContext()); } } auto It = Elems.find(ID); @@ -1912,11 +2509,11 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal { return It->getSecond().ID; } -auto SubsumptionChecker::find(FoldExpandedConstraint *Ori) -> Literal { - auto &Elems = FoldMap[Ori->Pattern]; +auto SubsumptionChecker::find(const FoldExpandedConstraint *Ori) -> Literal { + auto &Elems = FoldMap[Ori->getPattern()]; FoldExpendedConstraintKey K; - K.Kind = Ori->Kind; + K.Kind = Ori->getFoldOperator(); auto It = llvm::find_if(Elems, [&K](const FoldExpendedConstraintKey &Other) { return K.Kind == Other.Kind; @@ -1960,38 +2557,47 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) { AddUniqueClauseToFormula(Res, std::move(C)); }; - if (NC.isAtomic()) - return {{find(NC.getAtomicConstraint())}}; + switch (NC.getKind()) { - if (NC.isFoldExpanded()) - return {{find(NC.getFoldExpandedConstraint())}}; + case NormalizedConstraint::ConstraintKind::Atomic: + return {{find(&static_cast<const AtomicConstraint &>(NC))}}; - FormulaType Left, Right; - SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] { - Left = Normalize<FormulaType>(NC.getLHS()); - Right = Normalize<FormulaType>(NC.getRHS()); - }); + case NormalizedConstraint::ConstraintKind::FoldExpanded: + return {{find(&static_cast<const FoldExpandedConstraint &>(NC))}}; - if (NC.getCompoundKind() == FormulaType::Kind) { - auto SizeLeft = Left.size(); - Res = std::move(Left); - Res.reserve(SizeLeft + Right.size()); - std::for_each(std::make_move_iterator(Right.begin()), - std::make_move_iterator(Right.end()), Add); - return Res; - } + case NormalizedConstraint::ConstraintKind::ConceptId: + return Normalize<FormulaType>( + static_cast<const ConceptIdConstraint &>(NC).getNormalizedConstraint()); + + case NormalizedConstraint::ConstraintKind::Compound: { + const auto &Compound = static_cast<const CompoundConstraint &>(NC); + FormulaType Left, Right; + SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] { + Left = Normalize<FormulaType>(Compound.getLHS()); + Right = Normalize<FormulaType>(Compound.getRHS()); + }); + + if (Compound.getCompoundKind() == FormulaType::Kind) { + Res = std::move(Left); + Res.reserve(Left.size() + Right.size()); + std::for_each(std::make_move_iterator(Right.begin()), + std::make_move_iterator(Right.end()), Add); + return Res; + } - Res.reserve(Left.size() * Right.size()); - for (const auto <ransform : Left) { - for (const auto &RTransform : Right) { - Clause Combined; - Combined.reserve(LTransform.size() + RTransform.size()); - llvm::append_range(Combined, LTransform); - llvm::append_range(Combined, RTransform); - Add(std::move(Combined)); + Res.reserve(Left.size() * 
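The literal numbering above exists so that identical atomic constraints (same expression, equivalent parameter mapping) compare equal during subsumption. The classic source-level consequence, sketched with invented names, is overload partial ordering by constraints:

    template <class T>
    concept Animal = true;

    template <class T>
    concept Dog = Animal<T> && requires(T t) { t.bark(); };

    template <Animal T> int pick(T) { return 1; }
    template <Dog T>    int pick(T) { return 2; }   // preferred when both apply: Dog<T> subsumes Animal<T>

    struct Beagle { void bark(); };
    // pick(Beagle{}) selects the second overload and returns 2.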
Right.size()); + for (const auto <ransform : Left) { + for (const auto &RTransform : Right) { + Clause Combined; + Combined.reserve(LTransform.size() + RTransform.size()); + llvm::copy(LTransform, std::back_inserter(Combined)); + llvm::copy(RTransform, std::back_inserter(Combined)); + Add(std::move(Combined)); + } } + return Res; + } } - return Res; } void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) { @@ -2006,12 +2612,12 @@ std::optional<bool> SubsumptionChecker::Subsumes( const NamedDecl *DP, ArrayRef<AssociatedConstraint> P, const NamedDecl *DQ, ArrayRef<AssociatedConstraint> Q) { const NormalizedConstraint *PNormalized = - getNormalizedAssociatedConstraints(SemaRef, DP, P); + SemaRef.getNormalizedAssociatedConstraints(DP, P); if (!PNormalized) return std::nullopt; const NormalizedConstraint *QNormalized = - getNormalizedAssociatedConstraints(SemaRef, DQ, Q); + SemaRef.getNormalizedAssociatedConstraints(DQ, Q); if (!QNormalized) return std::nullopt; @@ -2061,9 +2667,9 @@ bool SubsumptionChecker::Subsumes(const FoldExpandedConstraint *A, // constraint B if they are compatible for subsumption, have the same // fold-operator, and the constraint of A subsumes that of B. bool DoesSubsume = - A->Kind == B->Kind && + A->getFoldOperator() == B->getFoldOperator() && FoldExpandedConstraint::AreCompatibleForSubsumption(*A, *B) && - Subsumes(&A->Constraint, &B->Constraint); + Subsumes(&A->getNormalizedPattern(), &B->getNormalizedPattern()); It = FoldSubsumptionCache.try_emplace(std::move(Key), DoesSubsume).first; } return It->second; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 16d42d2..d27f767 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -17876,13 +17876,15 @@ Decl *Sema::BuildStaticAssertDeclaration(SourceLocation StaticAssertLoc, findFailedBooleanCondition(Converted.get()); if (const auto *ConceptIDExpr = dyn_cast_or_null<ConceptSpecializationExpr>(InnerCond)) { - // Drill down into concept specialization expressions to see why they - // weren't satisfied. - Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed) - << !HasMessage << Msg.str() << AssertExpr->getSourceRange(); - ConstraintSatisfaction Satisfaction; - if (!CheckConstraintSatisfaction(ConceptIDExpr, Satisfaction)) - DiagnoseUnsatisfiedConstraint(Satisfaction); + const ASTConstraintSatisfaction &Satisfaction = + ConceptIDExpr->getSatisfaction(); + if (!Satisfaction.ContainsErrors || Satisfaction.NumRecords) { + Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed) + << !HasMessage << Msg.str() << AssertExpr->getSourceRange(); + // Drill down into concept specialization expressions to see why they + // weren't satisfied. + DiagnoseUnsatisfiedConstraint(ConceptIDExpr); + } } else if (InnerCond && !isa<CXXBoolLiteralExpr>(InnerCond) && !isa<IntegerLiteral>(InnerCond)) { Diag(InnerCond->getBeginLoc(), diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 576eb32..0fe242dce 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -7935,21 +7935,27 @@ Sema::BuildExprRequirement( // be satisfied. 
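The static_assert hunk above stops re-running satisfaction and instead reuses the satisfaction already stored on the concept-id, then drills into it. A minimal sketch of the user-facing situation (Streamable is an invented name; the assert is intentionally failing):

    #include <type_traits>

    template <class T>
    concept Streamable = requires(T t) { t.begin(); } && std::is_class_v<T>;

    static_assert(Streamable<int>);   // notes identify which conjunct of Streamable failed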
TemplateParameterList *TPL = ReturnTypeRequirement.getTypeConstraintTemplateParameterList(); - QualType MatchedType = - Context.getReferenceQualifiedType(E).getCanonicalType(); + QualType MatchedType = Context.getReferenceQualifiedType(E); llvm::SmallVector<TemplateArgument, 1> Args; Args.push_back(TemplateArgument(MatchedType)); auto *Param = cast<TemplateTypeParmDecl>(TPL->getParam(0)); - MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/false); + MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/true); MLTAL.addOuterRetainedLevels(TPL->getDepth()); const TypeConstraint *TC = Param->getTypeConstraint(); assert(TC && "Type Constraint cannot be null here"); auto *IDC = TC->getImmediatelyDeclaredConstraint(); assert(IDC && "ImmediatelyDeclaredConstraint can't be null here."); ExprResult Constraint = SubstExpr(IDC, MLTAL); - if (Constraint.isInvalid()) { + bool HasError = Constraint.isInvalid(); + if (!HasError) { + SubstitutedConstraintExpr = + cast<ConceptSpecializationExpr>(Constraint.get()); + if (SubstitutedConstraintExpr->getSatisfaction().ContainsErrors) + HasError = true; + } + if (HasError) { return new (Context) concepts::ExprRequirement( createSubstDiagAt(IDC->getExprLoc(), [&](llvm::raw_ostream &OS) { @@ -7958,8 +7964,6 @@ Sema::BuildExprRequirement( }), IsSimple, NoexceptLoc, ReturnTypeRequirement); } - SubstitutedConstraintExpr = - cast<ConceptSpecializationExpr>(Constraint.get()); if (!SubstitutedConstraintExpr->isSatisfied()) Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index c971293..0d0d2c0 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -8219,8 +8219,8 @@ ExprResult InitializationSequence::Perform(Sema &S, // InitializeTemporary entity for our target type. QualType Ty = Step->Type; bool IsTemporary = !S.Context.hasSameType(Entity.getType(), Ty); - InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(Ty); - InitializedEntity InitEntity = IsTemporary ? TempEntity : Entity; + InitializedEntity InitEntity = + IsTemporary ? InitializedEntity::InitializeTemporary(Ty) : Entity; InitListChecker PerformInitList(S, InitEntity, InitList, Ty, /*VerifyOnly=*/false, /*TreatUnavailableAsInvalid=*/false); @@ -8242,7 +8242,6 @@ ExprResult InitializationSequence::Perform(Sema &S, InitListExpr *StructuredInitList = PerformInitList.getFullyStructuredList(); - CurInit.get(); CurInit = shouldBindAsTemporary(InitEntity) ? S.MaybeBindToTemporary(StructuredInitList) : StructuredInitList; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index ea5c4265..b870114 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -804,7 +804,7 @@ clang::MakeDeductionFailureInfo(ASTContext &Context, case TemplateDeductionResult::ConstraintsNotSatisfied: { CNSInfo *Saved = new (Context) CNSInfo; Saved->TemplateArgs = Info.takeSugared(); - Saved->Satisfaction = Info.AssociatedConstraintsSatisfaction; + Saved->Satisfaction = std::move(Info.AssociatedConstraintsSatisfaction); Result.Data = Saved; break; } @@ -852,6 +852,7 @@ void DeductionFailureInfo::Destroy() { case TemplateDeductionResult::ConstraintsNotSatisfied: // FIXME: Destroy the template argument list? 
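The BuildExprRequirement change above keeps the matched type in sugared, reference-qualified form and treats a satisfaction that contains errors like a substitution failure. Sketch of the construct being checked, using std::convertible_to from <concepts> (AddsToInt is an invented name):

    #include <concepts>

    template <class T>
    concept AddsToInt = requires(T a, T b) {
      { a + b } -> std::convertible_to<int>;   // return-type-requirement checked against the sugared type
    };

    static_assert(AddsToInt<short>);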
+ static_cast<CNSInfo *>(Data)->Satisfaction.~ConstraintSatisfaction(); Data = nullptr; if (PartialDiagnosticAt *Diag = getSFINAEDiagnostic()) { Diag->~PartialDiagnosticAt(); @@ -12739,7 +12740,8 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand, << (unsigned)FnKindPair.first << (unsigned)ocs_non_template << FnDesc /* Ignored */; ConstraintSatisfaction Satisfaction; - if (S.CheckFunctionConstraints(Fn, Satisfaction)) + if (S.CheckFunctionConstraints(Fn, Satisfaction, SourceLocation(), + /*ForOverloadResolution=*/true)) break; S.DiagnoseUnsatisfiedConstraint(Satisfaction); } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 2bf1511..dcf2876 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "TreeTransform.h" +#include "clang/AST/ASTConcept.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" @@ -1222,8 +1223,9 @@ static ExprResult formImmediatelyDeclaredConstraint( if (auto *CD = dyn_cast<ConceptDecl>(NamedConcept)) { ImmediatelyDeclaredConstraint = S.CheckConceptTemplateId( SS, /*TemplateKWLoc=*/SourceLocation(), NameInfo, - /*FoundDecl=*/FoundDecl ? FoundDecl : NamedConcept, CD, - &ConstraintArgs); + /*FoundDecl=*/FoundDecl ? FoundDecl : CD, CD, &ConstraintArgs, + /*DoCheckConstraintSatisfaction=*/ + !S.inParameterMappingSubstitution()); } // We have a template template parameter else { @@ -4850,13 +4852,11 @@ void Sema::diagnoseMissingTemplateArguments(const CXXScopeSpec &SS, diagnoseMissingTemplateArguments(Name, Loc); } -ExprResult -Sema::CheckConceptTemplateId(const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - const DeclarationNameInfo &ConceptNameInfo, - NamedDecl *FoundDecl, - ConceptDecl *NamedConcept, - const TemplateArgumentListInfo *TemplateArgs) { +ExprResult Sema::CheckConceptTemplateId( + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, + const DeclarationNameInfo &ConceptNameInfo, NamedDecl *FoundDecl, + TemplateDecl *NamedConcept, const TemplateArgumentListInfo *TemplateArgs, + bool DoCheckConstraintSatisfaction) { assert(NamedConcept && "A concept template id without a template?"); if (NamedConcept->isInvalidDecl()) @@ -4873,33 +4873,48 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS, DiagnoseUseOfDecl(NamedConcept, ConceptNameInfo.getLoc()); + // There's a bug with CTAI.CanonicalConverted. + // If the template argument contains a DependentDecltypeType that includes a + // TypeAliasType, and the same written type had occurred previously in the + // source, then the DependentDecltypeType would be canonicalized to that + // previous type which would mess up the substitution. + // FIXME: Reland https://github.com/llvm/llvm-project/pull/101782 properly! 
auto *CSD = ImplicitConceptSpecializationDecl::Create( Context, NamedConcept->getDeclContext(), NamedConcept->getLocation(), - CTAI.CanonicalConverted); + CTAI.SugaredConverted); ConstraintSatisfaction Satisfaction; bool AreArgsDependent = TemplateSpecializationType::anyDependentTemplateArguments( - *TemplateArgs, CTAI.CanonicalConverted); - MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.CanonicalConverted, + *TemplateArgs, CTAI.SugaredConverted); + MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.SugaredConverted, /*Final=*/false); - LocalInstantiationScope Scope(*this); - - EnterExpressionEvaluationContext EECtx{ - *this, ExpressionEvaluationContext::Unevaluated, CSD}; - - if (!AreArgsDependent && - CheckConstraintSatisfaction( - NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()), - MLTAL, - SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(), - TemplateArgs->getRAngleLoc()), - Satisfaction)) - return ExprError(); auto *CL = ConceptReference::Create( Context, SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc{}, TemplateKWLoc, ConceptNameInfo, FoundDecl, NamedConcept, ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs)); + + bool Error = false; + if (const auto *Concept = dyn_cast<ConceptDecl>(NamedConcept); + Concept && Concept->getConstraintExpr() && !AreArgsDependent && + DoCheckConstraintSatisfaction) { + + LocalInstantiationScope Scope(*this); + + EnterExpressionEvaluationContext EECtx{ + *this, ExpressionEvaluationContext::Unevaluated, CSD}; + + Error = CheckConstraintSatisfaction( + NamedConcept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL, + SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(), + TemplateArgs->getRAngleLoc()), + Satisfaction, CL); + Satisfaction.ContainsErrors = Error; + } + + if (Error) + return ExprError(); + return ConceptSpecializationExpr::Create( Context, CL, CSD, AreArgsDependent ? nullptr : &Satisfaction); } @@ -5217,10 +5232,11 @@ bool Sema::CheckTemplateTypeArgument( } default: { // We allow instantiating a template with template argument packs when - // building deduction guides. + // building deduction guides or mapping constraint template parameters. if (Arg.getKind() == TemplateArgument::Pack && - CodeSynthesisContexts.back().Kind == - Sema::CodeSynthesisContext::BuildingDeductionGuides) { + (CodeSynthesisContexts.back().Kind == + Sema::CodeSynthesisContext::BuildingDeductionGuides || + inParameterMappingSubstitution())) { SugaredConverted.push_back(Arg); CanonicalConverted.push_back(Arg); return false; @@ -5813,6 +5829,20 @@ bool Sema::CheckTemplateArgumentList( TemplateArgumentListInfo &TemplateArgs, const DefaultArguments &DefaultArgs, bool PartialTemplateArgs, CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions, bool *ConstraintsNotSatisfied) { + return CheckTemplateArgumentList( + Template, GetTemplateParameterList(Template), TemplateLoc, TemplateArgs, + DefaultArgs, PartialTemplateArgs, CTAI, UpdateArgsWithConversions, + ConstraintsNotSatisfied); +} + +/// Check that the given template argument list is well-formed +/// for specializing the given template. 
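CheckConceptTemplateId above now builds the ConceptSpecializationExpr from the sugared converted arguments and skips the satisfaction check while a parameter mapping is being substituted. At the source level such an expression is simply a concept-id used as a bool, for example (Fancy is an invented name):

    template <class T>
    concept Fancy = requires(T t) { *t; };

    constexpr bool b = Fancy<int *>;            // a ConceptSpecializationExpr, checked eagerly here
    static_assert(Fancy<int *> && !Fancy<int>);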
+bool Sema::CheckTemplateArgumentList( + TemplateDecl *Template, TemplateParameterList *Params, + SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs, + const DefaultArguments &DefaultArgs, bool PartialTemplateArgs, + CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions, + bool *ConstraintsNotSatisfied) { if (ConstraintsNotSatisfied) *ConstraintsNotSatisfied = false; @@ -5822,8 +5852,6 @@ bool Sema::CheckTemplateArgumentList( // template. TemplateArgumentListInfo NewArgs = TemplateArgs; - TemplateParameterList *Params = GetTemplateParameterList(Template); - SourceLocation RAngleLoc = NewArgs.getRAngleLoc(); // C++23 [temp.arg.general]p1: @@ -6163,11 +6191,12 @@ bool Sema::CheckTemplateArgumentList( CXXThisScopeRAII Scope(*this, RD, ThisQuals, RD != nullptr); MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs( - Template, NewContext, /*Final=*/false, CTAI.CanonicalConverted, + Template, NewContext, /*Final=*/true, CTAI.SugaredConverted, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConceptInstantiation=*/true); - if (EnsureTemplateArgumentListConstraints( + if (!isa<ConceptDecl>(Template) && + EnsureTemplateArgumentListConstraints( Template, MLTAL, SourceRange(TemplateLoc, TemplateArgs.getRAngleLoc()))) { if (ConstraintsNotSatisfied) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index f6ee745..6bba505 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3206,7 +3206,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template, // If we don't need to replace the deduced template arguments, // we can add them immediately as the inner-most argument list. if (!DeducedArgsNeedReplacement) - Innermost = CanonicalDeducedArgs; + Innermost = SugaredDeducedArgs; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( Template, Template->getDeclContext(), /*Final=*/false, Innermost, @@ -3218,7 +3218,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template, // not class-scope explicit specialization, so replace with Deduced Args // instead of adding to inner-most. 
if (!Innermost) - MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs); + MLTAL.replaceInnermostTemplateArguments(Template, SugaredDeducedArgs); if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL, Info.getLocation(), @@ -3995,11 +3995,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( if (CheckFunctionTemplateConstraints( Info.getLocation(), FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(), - CTAI.CanonicalConverted, Info.AssociatedConstraintsSatisfaction)) + CTAI.SugaredConverted, Info.AssociatedConstraintsSatisfaction)) return TemplateDeductionResult::MiscellaneousDeductionFailure; if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { - Info.reset(Info.takeSugared(), TemplateArgumentList::CreateCopy( - Context, CTAI.CanonicalConverted)); + Info.reset( + TemplateArgumentList::CreateCopy(Context, CTAI.SugaredConverted), + Info.takeCanonical()); return TemplateDeductionResult::ConstraintsNotSatisfied; } } @@ -5167,8 +5168,8 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, /*DefaultArgs=*/{}, /*PartialTemplateArgs=*/false, CTAI)) return true; - MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.CanonicalConverted, - /*Final=*/false); + MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.SugaredConverted, + /*Final=*/true); // Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so // that the template arguments of the constraint can be preserved. For // example: @@ -5182,7 +5183,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, S, Sema::ExpressionEvaluationContext::Unevaluated, ImplicitConceptSpecializationDecl::Create( S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(), - CTAI.CanonicalConverted)); + CTAI.SugaredConverted)); if (S.CheckConstraintSatisfaction( Concept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL, TypeLoc.getLocalSourceRange(), Satisfaction)) @@ -6676,10 +6677,11 @@ namespace { struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor { llvm::SmallBitVector &Used; unsigned Depth; + bool VisitDeclRefTypes = true; - MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used, - unsigned Depth) - : Used(Used), Depth(Depth) { } + MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used, unsigned Depth, + bool VisitDeclRefTypes = true) + : Used(Used), Depth(Depth), VisitDeclRefTypes(VisitDeclRefTypes) {} bool VisitTemplateTypeParmType(TemplateTypeParmType *T) override { if (T->getDepth() == Depth) @@ -6700,6 +6702,8 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor { if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(E->getDecl())) if (NTTP->getDepth() == Depth) Used[NTTP->getIndex()] = true; + if (VisitDeclRefTypes) + DynamicRecursiveASTVisitor::TraverseType(E->getType()); return true; } @@ -7043,10 +7047,13 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, break; case Type::UnaryTransform: - if (!OnlyDeduced) - MarkUsedTemplateParameters(Ctx, - cast<UnaryTransformType>(T)->getUnderlyingType(), - OnlyDeduced, Depth, Used); + if (!OnlyDeduced) { + auto *UTT = cast<UnaryTransformType>(T); + auto Next = UTT->getUnderlyingType(); + if (Next.isNull()) + Next = UTT->getBaseType(); + MarkUsedTemplateParameters(Ctx, Next, OnlyDeduced, Depth, Used); + } break; case Type::PackExpansion: @@ -7146,6 +7153,12 @@ Sema::MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced, ::MarkUsedTemplateParameters(Context, E, OnlyDeduced, Depth, Used); } +void 
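CheckDeducedPlaceholderConstraints above now checks a constrained placeholder against the sugared deduced arguments. A minimal sketch of the construct, using std::integral from <concepts>:

    #include <concepts>

    std::integral auto n = 42;         // OK: int satisfies std::integral
    // std::integral auto x = 3.14;    // would be rejected: double is not integral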
Sema::MarkUsedTemplateParametersForSubsumptionParameterMapping( + const Expr *E, unsigned Depth, llvm::SmallBitVector &Used) { + MarkUsedTemplateParameterVisitor(Used, Depth, /*VisitDeclRefTypes=*/false) + .TraverseStmt(const_cast<Expr *>(E)); +} + void Sema::MarkUsedTemplateParameters(const TemplateArgumentList &TemplateArgs, bool OnlyDeduced, unsigned Depth, @@ -7171,6 +7184,14 @@ void Sema::MarkUsedTemplateParameters(ArrayRef<TemplateArgument> TemplateArgs, /*OnlyDeduced=*/false, Depth, Used); } +void Sema::MarkUsedTemplateParameters( + ArrayRef<TemplateArgumentLoc> TemplateArgs, unsigned Depth, + llvm::SmallBitVector &Used) { + for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) + ::MarkUsedTemplateParameters(Context, TemplateArgs[I].getArgument(), + /*OnlyDeduced=*/false, Depth, Used); +} + void Sema::MarkDeducedTemplateParameters( ASTContext &Ctx, const FunctionTemplateDecl *FunctionTemplate, llvm::SmallBitVector &Deduced) { diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index fe673ea..9a61888 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -1171,17 +1171,46 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { const auto &D = DeduceResults[Index]; + auto *TP = F->getTemplateParameters()->getParam(Index); if (IsNonDeducedArgument(D)) { // 2): Non-deduced template parameters would be substituted later. continue; } TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); - TemplateArgumentLoc Output; - if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { - assert(TemplateArgsForBuildingFPrime[Index].isNull() && - "InstantiatedArgs must be null before setting"); - TemplateArgsForBuildingFPrime[Index] = Output.getArgument(); + TemplateArgumentListInfo Output; + if (SemaRef.SubstTemplateArguments(Input, Args, Output)) + return nullptr; + assert(TemplateArgsForBuildingFPrime[Index].isNull() && + "InstantiatedArgs must be null before setting"); + // CheckTemplateArgument is necessary for NTTP initializations. + // FIXME: We may want to call CheckTemplateArguments instead, but we cannot + // match packs as usual, since packs can appear in the middle of the + // parameter list of a synthesized CTAD guide. See also the FIXME in + // test/SemaCXX/cxx20-ctad-type-alias.cpp:test25. + Sema::CheckTemplateArgumentInfo CTAI; + if (Input.getArgument().getKind() == TemplateArgument::Pack) { + for (auto TA : Output.arguments()) { + if (SemaRef.CheckTemplateArgument( + TP, TA, F, F->getLocation(), F->getLocation(), + /*ArgumentPackIndex=*/-1, CTAI, + Sema::CheckTemplateArgumentKind::CTAK_Specified)) + return nullptr; + } + // We will substitute the non-deduced template arguments with these + // transformed (unpacked at this point) arguments, where that substitution + // requires a pack for the corresponding parameter packs. 
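BuildDeductionGuideForTypeAlias above now runs CheckTemplateArgument on each substituted argument, including the unpacked arguments when a pack sits in the middle of the synthesized guide's parameter list. The simplest form of the feature involved, alias-template CTAD, looks like this (Vec is an invented alias; the pack-in-the-middle case the FIXME mentions builds on the same mechanism):

    #include <vector>

    template <class T>
    using Vec = std::vector<T>;

    Vec v{1, 2, 3};   // C++20 alias CTAD: deduces std::vector<int>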
+ TemplateArgsForBuildingFPrime[Index] = + TemplateArgument::CreatePackCopy(Context, CTAI.SugaredConverted); + } else { + assert(Output.arguments().size() == 1); + TemplateArgumentLoc Transformed = Output.arguments()[0]; + if (SemaRef.CheckTemplateArgument( + TP, Transformed, F, F->getLocation(), F->getLocation(), + /*ArgumentPackIndex=*/-1, CTAI, + Sema::CheckTemplateArgumentKind::CTAK_Specified)) + return nullptr; + TemplateArgsForBuildingFPrime[Index] = CTAI.SugaredConverted[0]; } } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index f1c9c5c..1f762ca 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -628,9 +628,14 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( Inst.InstantiationRange = InstantiationRange; Inst.InConstraintSubstitution = Inst.Kind == CodeSynthesisContext::ConstraintSubstitution; - if (!SemaRef.CodeSynthesisContexts.empty()) + Inst.InParameterMappingSubstitution = + Inst.Kind == CodeSynthesisContext::ParameterMappingSubstitution; + if (!SemaRef.CodeSynthesisContexts.empty()) { Inst.InConstraintSubstitution |= SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution; + Inst.InParameterMappingSubstitution |= + SemaRef.CodeSynthesisContexts.back().InParameterMappingSubstitution; + } Invalid = SemaRef.pushCodeSynthesisContext(Inst); if (!Invalid) { @@ -1375,6 +1380,7 @@ std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const { // Template Instantiation for Types //===----------------------------------------------------------------------===/ namespace { + class TemplateInstantiator : public TreeTransform<TemplateInstantiator> { const MultiLevelTemplateArgumentList &TemplateArgs; SourceLocation Loc; @@ -1387,7 +1393,11 @@ namespace { // Whether an incomplete substituion should be treated as an error. bool BailOutOnIncomplete; - private: + // Whether to rebuild pack expansion types; We don't do that when + // rebuilding the parameter mapping of a fold expression appearing + // in a constraint expression. + bool BuildPackExpansionTypes = true; + // CWG2770: Function parameters should be instantiated when they are // needed by a satisfaction check of an atomic constraint or // (recursively) by another function parameter. @@ -1410,6 +1420,17 @@ namespace { return EvaluateConstraints; } + inline static struct ForParameterMappingSubstitution_t { + } ForParameterMappingSubstitution; + + TemplateInstantiator(ForParameterMappingSubstitution_t, Sema &SemaRef, + SourceLocation Loc, + const MultiLevelTemplateArgumentList &TemplateArgs, + bool BuildPackExpansionTypes) + : inherited(SemaRef), TemplateArgs(TemplateArgs), Loc(Loc), + BailOutOnIncomplete(false), + BuildPackExpansionTypes(BuildPackExpansionTypes) {} + /// Determine whether the given type \p T has already been /// transformed. 
/// @@ -1444,7 +1465,8 @@ namespace { bool &ShouldExpand, bool &RetainExpansion, UnsignedOrNone &NumExpansions) { if (SemaRef.CurrentInstantiationScope && - SemaRef.inConstraintSubstitution()) { + (SemaRef.inConstraintSubstitution() || + SemaRef.inParameterMappingSubstitution())) { for (UnexpandedParameterPack ParmPack : Unexpanded) { NamedDecl *VD = ParmPack.first.dyn_cast<NamedDecl *>(); if (auto *PVD = dyn_cast_if_present<ParmVarDecl>(VD); @@ -1465,10 +1487,10 @@ namespace { TemplateArgument ForgetPartiallySubstitutedPack() { TemplateArgument Result; - if (NamedDecl *PartialPack - = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){ - MultiLevelTemplateArgumentList &TemplateArgs - = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs); + if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope + ->getPartiallySubstitutedPack()) { + MultiLevelTemplateArgumentList &TemplateArgs = + const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs); unsigned Depth, Index; std::tie(Depth, Index) = getDepthAndIndex(PartialPack); if (TemplateArgs.hasTemplateArgument(Depth, Index)) { @@ -1488,10 +1510,10 @@ namespace { if (Arg.isNull()) return; - if (NamedDecl *PartialPack - = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){ - MultiLevelTemplateArgumentList &TemplateArgs - = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs); + if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope + ->getPartiallySubstitutedPack()) { + MultiLevelTemplateArgumentList &TemplateArgs = + const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs); unsigned Depth, Index; std::tie(Depth, Index) = getDepthAndIndex(PartialPack); TemplateArgs.setArgument(Depth, Index, Arg); @@ -1508,9 +1530,9 @@ namespace { std::move(New); return Old; } + void RememberSubstitution(MultiLevelTemplateArgumentList Old) { - const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) = - std::move(Old); + const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) = Old; } TemplateArgument @@ -1691,6 +1713,24 @@ namespace { return inherited::TransformTemplateArgument(Input, Output, Uneval); } + // This has to be here to allow its overload. + ExprResult RebuildPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc, + UnsignedOrNone NumExpansions) { + return inherited::RebuildPackExpansion(Pattern, EllipsisLoc, + NumExpansions); + } + + TemplateArgumentLoc RebuildPackExpansion(TemplateArgumentLoc Pattern, + SourceLocation EllipsisLoc, + UnsignedOrNone NumExpansions) { + // We don't rewrite a PackExpansion type when we want to normalize a + // CXXFoldExpr constraint. We'll expand it when evaluating the constraint. 
+ if (BuildPackExpansionTypes) + return inherited::RebuildPackExpansion(Pattern, EllipsisLoc, + NumExpansions); + return Pattern; + } + using TreeTransform::TransformTemplateSpecializationType; QualType TransformTemplateSpecializationType(TypeLocBuilder &TLB, @@ -1961,7 +2001,8 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { if (ParmVarDecl *PVD = dyn_cast<ParmVarDecl>(D); PVD && SemaRef.CurrentInstantiationScope && - SemaRef.inConstraintSubstitution() && + (SemaRef.inConstraintSubstitution() || + SemaRef.inParameterMappingSubstitution()) && maybeInstantiateFunctionParameterToScope(PVD)) return nullptr; @@ -2759,18 +2800,29 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) { concepts::NestedRequirement * TemplateInstantiator::TransformNestedRequirement( concepts::NestedRequirement *Req) { - if (!Req->isDependent() && !AlwaysRebuild()) - return Req; + + ASTContext &C = SemaRef.Context; + + Expr *Constraint = Req->getConstraintExpr(); + ConstraintSatisfaction Satisfaction; + + auto NestedReqWithDiag = [&C, this](Expr *E, + ConstraintSatisfaction Satisfaction) { + Satisfaction.IsSatisfied = false; + SmallString<128> Entity; + llvm::raw_svector_ostream OS(Entity); + E->printPretty(OS, nullptr, SemaRef.getPrintingPolicy()); + return new (C) concepts::NestedRequirement( + SemaRef.Context, C.backupStr(Entity), std::move(Satisfaction)); + }; + if (Req->hasInvalidConstraint()) { if (AlwaysRebuild()) return RebuildNestedRequirement(Req->getInvalidConstraintEntity(), Req->getConstraintSatisfaction()); return Req; } - Sema::InstantiatingTemplate ReqInst(SemaRef, - Req->getConstraintExpr()->getBeginLoc(), Req, - Sema::InstantiatingTemplate::ConstraintsCheck{}, - Req->getConstraintExpr()->getSourceRange()); + if (!getEvaluateConstraints()) { ExprResult TransConstraint = TransformExpr(Req->getConstraintExpr()); if (TransConstraint.isInvalid() || !TransConstraint.get()) @@ -2783,45 +2835,45 @@ TemplateInstantiator::TransformNestedRequirement( SemaRef.Context, TransConstraint.get(), Satisfaction); } - ExprResult TransConstraint; - ConstraintSatisfaction Satisfaction; - TemplateDeductionInfo Info(Req->getConstraintExpr()->getBeginLoc()); + bool Success; + Expr *NewConstraint; + TemplateDeductionInfo Info(Constraint->getBeginLoc()); { EnterExpressionEvaluationContext ContextRAII( SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated); - Sema::SFINAETrap Trap(SemaRef); - Sema::InstantiatingTemplate ConstrInst(SemaRef, - Req->getConstraintExpr()->getBeginLoc(), Req, Info, - Req->getConstraintExpr()->getSourceRange()); + + Sema::InstantiatingTemplate ConstrInst( + SemaRef, Constraint->getBeginLoc(), Req, + Sema::InstantiatingTemplate::ConstraintsCheck(), + Constraint->getSourceRange()); + if (ConstrInst.isInvalid()) return nullptr; - llvm::SmallVector<Expr *> Result; - if (!SemaRef.CheckConstraintSatisfaction( - nullptr, - AssociatedConstraint(Req->getConstraintExpr(), - SemaRef.ArgPackSubstIndex), - Result, TemplateArgs, Req->getConstraintExpr()->getSourceRange(), - Satisfaction) && - !Result.empty()) - TransConstraint = Result[0]; - assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled " - "by CheckConstraintSatisfaction."); + + Sema::SFINAETrap Trap(SemaRef); + + Success = !SemaRef.CheckConstraintSatisfaction( + Req, AssociatedConstraint(Constraint, SemaRef.ArgPackSubstIndex), + TemplateArgs, Constraint->getSourceRange(), Satisfaction, + /*TopLevelConceptId=*/nullptr, &NewConstraint); + + assert(!Success || 
!Trap.hasErrorOccurred() && + "Substitution failures must be handled " + "by CheckConstraintSatisfaction."); } - ASTContext &C = SemaRef.Context; - if (TransConstraint.isUsable() && - TransConstraint.get()->isInstantiationDependent()) - return new (C) concepts::NestedRequirement(TransConstraint.get()); - if (TransConstraint.isInvalid() || !TransConstraint.get() || - Satisfaction.HasSubstitutionFailure()) { - SmallString<128> Entity; - llvm::raw_svector_ostream OS(Entity); - Req->getConstraintExpr()->printPretty(OS, nullptr, - SemaRef.getPrintingPolicy()); - return new (C) concepts::NestedRequirement( - SemaRef.Context, C.backupStr(Entity), Satisfaction); + + if (!Success || Satisfaction.HasSubstitutionFailure()) + return NestedReqWithDiag(Constraint, Satisfaction); + + // FIXME: const correctness + // MLTAL might be dependent. + if (!NewConstraint) { + if (!Satisfaction.IsSatisfied) + return NestedReqWithDiag(Constraint, Satisfaction); + + NewConstraint = Constraint; } - return new (C) - concepts::NestedRequirement(C, TransConstraint.get(), Satisfaction); + return new (C) concepts::NestedRequirement(C, NewConstraint, Satisfaction); } TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T, @@ -3078,7 +3130,7 @@ bool Sema::SubstTypeConstraint( const ASTTemplateArgumentListInfo *TemplArgInfo = TC->getTemplateArgsAsWritten(); - if (!EvaluateConstraints) { + if (!EvaluateConstraints && !inParameterMappingSubstitution()) { UnsignedOrNone Index = TC->getArgPackSubstIndex(); if (!Index) Index = SemaRef.ArgPackSubstIndex; @@ -4378,6 +4430,16 @@ bool Sema::SubstTemplateArguments( return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out); } +bool Sema::SubstTemplateArgumentsInParameterMapping( + ArrayRef<TemplateArgumentLoc> Args, SourceLocation BaseLoc, + const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes) { + TemplateInstantiator Instantiator( + TemplateInstantiator::ForParameterMappingSubstitution, *this, BaseLoc, + TemplateArgs, BuildPackExpansionTypes); + return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out); +} + ExprResult Sema::SubstExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!E) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 6967301..51b55b8 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3722,10 +3722,6 @@ public: ParentContext); } - /// Build a new Objective-C boxed expression. - /// - /// By default, performs semantic analysis to build the new expression. - /// Subclasses may override this routine to provide different behavior. ExprResult RebuildConceptSpecializationExpr(NestedNameSpecifierLoc NNS, SourceLocation TemplateKWLoc, DeclarationNameInfo ConceptNameInfo, NamedDecl *FoundDecl, ConceptDecl *NamedConcept, @@ -5110,9 +5106,13 @@ bool TreeTransform<Derived>::TransformTemplateArguments( typedef TemplateArgumentLocInventIterator<Derived, TemplateArgument::pack_iterator> PackLocIterator; + + TemplateArgumentListInfo *PackOutput = &Outputs; + TemplateArgumentListInfo New; + if (TransformTemplateArguments( PackLocIterator(*this, In.getArgument().pack_begin()), - PackLocIterator(*this, In.getArgument().pack_end()), Outputs, + PackLocIterator(*this, In.getArgument().pack_end()), *PackOutput, Uneval)) return true; @@ -5179,7 +5179,6 @@ bool TreeTransform<Derived>::TransformTemplateArguments( } return false; - } // FIXME: Find ways to reduce code duplication for pack expansions. 
@@ -6247,7 +6246,7 @@ ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam( /* DefArg */ nullptr); newParm->setScopeInfo(OldParm->getFunctionScopeDepth(), OldParm->getFunctionScopeIndex() + indexAdjustment); - transformedLocalDecl(OldParm, {newParm}); + getDerived().transformedLocalDecl(OldParm, {newParm}); return newParm; } @@ -7082,11 +7081,11 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType( TypeLocBuilder &TLB, UnaryTransformTypeLoc TL) { QualType Result = TL.getType(); + TypeSourceInfo *NewBaseTSI = TL.getUnderlyingTInfo(); if (Result->isDependentType()) { const UnaryTransformType *T = TL.getTypePtr(); - TypeSourceInfo *NewBaseTSI = - getDerived().TransformType(TL.getUnderlyingTInfo()); + NewBaseTSI = getDerived().TransformType(TL.getUnderlyingTInfo()); if (!NewBaseTSI) return QualType(); QualType NewBase = NewBaseTSI->getType(); @@ -7101,7 +7100,7 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType( UnaryTransformTypeLoc NewTL = TLB.push<UnaryTransformTypeLoc>(Result); NewTL.setKWLoc(TL.getKWLoc()); NewTL.setParensRange(TL.getParensRange()); - NewTL.setUnderlyingTInfo(TL.getUnderlyingTInfo()); + NewTL.setUnderlyingTInfo(NewBaseTSI); return Result; } diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index cf32d4f..5456e73 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2424,7 +2424,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl( VisitDecl(D); llvm::SmallVector<TemplateArgument, 4> Args; for (unsigned I = 0; I < D->NumTemplateArgs; ++I) - Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/true)); + Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/false)); D->setTemplateArguments(Args); } diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 70b898a..eef97a8 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -807,15 +807,19 @@ readConstraintSatisfaction(ASTRecordReader &Record) { if (!Satisfaction.IsSatisfied) { unsigned NumDetailRecords = Record.readInt(); for (unsigned i = 0; i != NumDetailRecords; ++i) { - if (/* IsDiagnostic */Record.readInt()) { + auto Kind = Record.readInt(); + if (Kind == 0) { SourceLocation DiagLocation = Record.readSourceLocation(); StringRef DiagMessage = C.backupStr(Record.readString()); - Satisfaction.Details.emplace_back( - new (C) ConstraintSatisfaction::SubstitutionDiagnostic( - DiagLocation, DiagMessage)); - } else + Satisfaction.Details.emplace_back(new ( + C) ConstraintSubstitutionDiagnostic(DiagLocation, DiagMessage)); + } else if (Kind == 1) { Satisfaction.Details.emplace_back(Record.readExpr()); + } else { + assert(Kind == 2); + Satisfaction.Details.emplace_back(Record.readConceptReference()); + } } } return Satisfaction; diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index ebda91e..acf3453 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -482,14 +482,20 @@ addConstraintSatisfaction(ASTRecordWriter &Record, if (!Satisfaction.IsSatisfied) { Record.push_back(Satisfaction.NumRecords); for (const auto &DetailRecord : Satisfaction) { - auto *E = dyn_cast<Expr *>(DetailRecord); - Record.push_back(/* IsDiagnostic */ E == nullptr); - if (E) - Record.AddStmt(E); - else { - auto *Diag = cast<std::pair<SourceLocation, StringRef> *>(DetailRecord); + if (auto *Diag = 
dyn_cast<const ConstraintSubstitutionDiagnostic *>( + DetailRecord)) { + Record.push_back(/*Kind=*/0); Record.AddSourceLocation(Diag->first); Record.AddString(Diag->second); + continue; + } + if (auto *E = dyn_cast<const Expr *>(DetailRecord)) { + Record.push_back(/*Kind=*/1); + Record.AddStmt(const_cast<Expr *>(E)); + } else { + Record.push_back(/*Kind=*/2); + auto *CR = cast<const ConceptReference *>(DetailRecord); + Record.AddConceptReference(CR); } } } diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp index 84d981d..9419dba 100644 --- a/clang/test/AST/ast-dump-concepts.cpp +++ b/clang/test/AST/ast-dump-concepts.cpp @@ -20,8 +20,9 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} <col:13, col:31> 'bool' Concept {{.*}} 'binary_concept' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} <line:13:9> col:9 - // CHECK-NEXT: | |-TemplateArgument type 'type-parameter-1-0' - // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 + // CHECK-NEXT: | |-TemplateArgument type 'R' + // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0 + // CHECK-NEXT: | | `-TemplateTypeParm {{.*}} 'R' // CHECK-NEXT: | `-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType {{.*}} 'int' // CHECK-NEXT: |-TemplateArgument {{.*}} type 'R' @@ -35,8 +36,9 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} <col:13> 'bool' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} <line:10:9> col:9 - // CHECK-NEXT: | `-TemplateArgument type 'type-parameter-1-0' - // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 + // CHECK-NEXT: | `-TemplateArgument type 'R' + // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0 + // CHECK-NEXT: | `-TemplateTypeParm {{.*}} 'R' template <unary_concept R> Foo(R); diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp index 781fb9f..9a3adbc 100644 --- a/clang/test/AST/ast-dump-ctad-alias.cpp +++ b/clang/test/AST/ast-dump-ctad-alias.cpp @@ -185,17 +185,18 @@ void foo() { // CHECK-NEXT: | |-BinaryOperator {{.*}} 'bool' '&&' // CHECK-NEXT: | | |-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'invocable' // CHECK-NEXT: | | | |-ImplicitConceptSpecializationDecl {{.*}} -// CHECK-NEXT: | | | | |-TemplateArgument type 'type-parameter-0-2' -// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2 -// CHECK-NEXT: | | | | `-TemplateArgument pack '<GH124715::Packs<type-parameter-0-1...>>' -// CHECK-NEXT: | | | | `-TemplateArgument type 'GH124715::Packs<type-parameter-0-1...>' -// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'GH124715::Packs<type-parameter-0-1...>' dependent -// CHECK-NEXT: | | | | |-name: 'GH124715::Packs' +// CHECK-NEXT: | | | | |-TemplateArgument type 'U' +// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2 +// CHECK-NEXT: | | | | | `-TemplateTypeParm {{.*}} 'U' +// CHECK-NEXT: | | | | `-TemplateArgument pack '<Packs<Ts...>>' +// CHECK-NEXT: | | | | `-TemplateArgument type 'Packs<Ts...>' +// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'Packs<Ts...>' dependent +// CHECK-NEXT: | | | | |-name: 'Packs':'GH124715::Packs' qualified // CHECK-NEXT: | | | | | 
`-ClassTemplateDecl {{.*}} Packs -// CHECK-NEXT: | | | | `-TemplateArgument pack '<type-parameter-0-1...>' -// CHECK-NEXT: | | | | `-TemplateArgument type 'type-parameter-0-1...' -// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'type-parameter-0-1...' dependent -// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | | | | `-TemplateArgument type 'Ts...' +// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'Ts...' dependent +// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'Ts' // CHECK-NEXT: | | | |-TemplateArgument {{.*}} type 'U':'type-parameter-0-2' // CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2 // CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'U' diff --git a/clang/test/CXX/drs/cwg25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp index 5c2948f..0e0fc73 100644 --- a/clang/test/CXX/drs/cwg25xx.cpp +++ b/clang/test/CXX/drs/cwg25xx.cpp @@ -243,19 +243,20 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 // since-cxx20-note@#cwg2565-VC {{because 'b' would be invalid: argument may not have 'void' type}} template<typename T> - concept ErrorRequires = requires (ErrorRequires auto x) { + concept ErrorRequires = requires (ErrorRequires auto x) { // #cwg2565-expr // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} // since-cxx20-note@-2 {{declared here}} // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}} x; }; static_assert(ErrorRequires<int>); - // since-cxx20-error@-1 {{static assertion failed}} - // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // since-cxx20-error@-1 {{static assertion failed}} \ + // since-cxx20-note@-1 {{because 'int' does not satisfy 'ErrorRequires'}} \ + // since-cxx20-note@#cwg2565-expr {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} template<typename T> concept NestedErrorInRequires = requires (T x) { // #cwg2565-NEIR - requires requires (NestedErrorInRequires auto y) { + requires requires (NestedErrorInRequires auto y) { // #cwg2565-NEIR-inner // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} // since-cxx20-note@#cwg2565-NEIR {{declared here}} // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}} @@ -263,8 +264,9 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 }; }; static_assert(NestedErrorInRequires<int>); - // since-cxx20-error@-1 {{static assertion failed}} - // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // since-cxx20-error@-1 {{static assertion failed}} \ + // since-cxx20-note@-1 {{because 'int' does not satisfy 'NestedErrorInRequires'}} \ + // since-cxx20-note-re@#cwg2565-NEIR-inner {{because {{.*}} would be invalid: constraint depends on a previously diagnosed expression}} #endif } // namespace cwg2565 diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp index 28b5d0a..af2fc93 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp @@ -140,7 +140,8 @@ concept C7 = sizeof(T) == 1 || sizeof( ::type) == 1; static_assert(!C6<short>); 
-static_assert(!C6<char>); // expected-note{{while checking the satisfaction of concept 'C6<char>' requested here}} +static_assert(!C6<char>); +// expected-note@-1 {{while checking the satisfaction of concept 'C6<char>' requested here}} static_assert(C7<char>); static_assert(!C7<short>); // expected-note{{while checking the satisfaction of concept 'C7<short>' requested here}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index 31587a9..af2dce8 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -35,14 +35,14 @@ using r2i2 = r2<A>; // expected-error{{constraints not satisfied for class templ using r2i3 = r2<D>; using r2i4 = r2<const D>; // expected-error{{constraints not satisfied for class template 'r2' [with T = const D]}} -template<typename T> requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} +template<typename T> requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} struct r3 {}; using r3i1 = r3<int>; using r3i2 = r3<A>; using r3i3 = r3<A &>; using r3i4 = r3<void>; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} +using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} // Non-dependent expressions @@ -89,7 +89,7 @@ template<typename T> concept Large = sizeof(typename remove_reference<T>::type) >= 4; // expected-note@-1{{because 'sizeof(typename remove_reference<short &>::type) >= 4' (2 >= 4) evaluated to false}} -template<typename T> requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large':}} +template<typename T> requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large'}} struct r7 {}; using r7i1 = r7<int>; @@ -149,7 +149,7 @@ namespace std_example { template<typename T> constexpr bool is_same_v<T, T> = true; template<typename T, typename U> concept same_as = is_same_v<T, U>; - // expected-note@-1 {{because 'is_same_v<int, int *>' evaluated to false}} + // expected-note@-1 {{because 'is_same_v<int, typename std_example::T2::inner>' evaluated to false}} static_assert(C1<int>); static_assert(C1<int*>); @@ -160,7 +160,7 @@ namespace std_example { template<typename T> concept C2 = requires(T x) { {*x} -> same_as<typename T::inner>; - // expected-note@-1{{because type constraint 'same_as<int, typename std_example::T2::inner>' was not satisfied:}} + // expected-note@-1{{because 'same_as<int, typename std_example::T2::inner>' evaluated to false}} // expected-note@-2{{because '*x' would be invalid: indirection requires pointer operand ('int' invalid)}} }; @@ -173,9 +173,9 @@ namespace std_example { int operator *() { return 0; } }; static_assert(C2<T1>); - template<C2 T> struct C2_check {}; // 
expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'std_example::T2' does not satisfy 'C2'}} + template<C2 T> struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'T2' does not satisfy 'C2'}} using c2c1 = C2_check<int>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = int]}} - using c2c2 = C2_check<T2>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::T2]}} + using c2c2 = C2_check<T2>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = T2]}} template<typename T> void g(T t) noexcept(sizeof(T) == 1) {} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 033ae34..70a96be 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -43,11 +43,10 @@ namespace std_example { requires sizeof(a) == 4; // OK requires a == 0; // expected-error{{substitution into constraint expression resulted in a non-constant expression}} // expected-note@-1{{while checking the satisfaction of nested requirement requested here}} - // expected-note@-2{{in instantiation of requirement here}} - // expected-note@-3{{while checking the satisfaction of nested requirement requested here}} - // expected-note@-6{{while substituting template arguments into constraint expression here}} - // expected-note@-5{{function parameter 'a' with unknown value cannot be used in a constant expression}} - // expected-note@-8{{declared here}} + // expected-note@-2{{while checking the satisfaction of nested requirement requested here}} + // expected-note@-5{{while substituting template arguments into constraint expression here}} + // expected-note@-4{{function parameter 'a' with unknown value cannot be used in a constant expression}} + // expected-note@-7{{declared here}} }; static_assert(C2<int>); // expected-error{{static assertion failed}} // expected-note@-1{{while checking the satisfaction of concept 'C2<int>' requested here}} @@ -84,31 +83,26 @@ static_assert(Pipes<S>); static_assert(Pipes<double>); static_assert(Amps1<S>); -static_assert(!Amps1<double>); +static_assert(Amps1<double>); static_assert(Amps2<S>); -static_assert(!Amps2<double>); +static_assert(Amps2<double>); template<class T> -void foo1() requires requires (T x) { // #foo1 +void foo1() requires requires (T x) { requires - True<decltype(x.value)> // #foo1Value + True<decltype(x.value)> && True<T>; } {} template<class T> void fooPipes() requires Pipes<T> {} -template<class T> void fooAmps1() requires Amps1<T> {} // #fooAmps1 +template<class T> void fooAmps1() requires Amps1<T> {} void foo() { foo1<S>(); - foo1<int>(); // expected-error {{no matching function for call to 'foo1'}} - // expected-note@#foo1Value {{because 'True<decltype(x.value)> && True<T>' would be invalid: member reference base type 'int' is not a structure or union}} - // expected-note@#foo1 {{candidate template ignored: constraints not satisfied [with T = int]}} + foo1<int>(); fooPipes<S>(); fooPipes<int>(); fooAmps1<S>(); - fooAmps1<int>(); // expected-error {{no matching function for call to 'fooAmps1'}} - // expected-note@#fooAmps1 {{candidate template ignored: constraints not satisfied [with T = int]}} - // expected-note@#fooAmps1 {{because 'int' does not satisfy 'Amps1'}} - // expected-note@#Amps1 {{because 
'True<decltype(x.value)> && True<T> && !False<T>' would be invalid: member reference base type 'int' is not a structure or union}} + fooAmps1<int>(); } template<class T> @@ -158,15 +152,16 @@ void func() { // expected-note@#bar {{while substituting template arguments into constraint expression here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} // expected-note@#bar {{candidate template ignored: constraints not satisfied [with T = False]}} - // expected-note@#bar {{because 'X<SubstitutionFailureNestedRequires::ErrorExpressions_NotSF::False>::value' evaluated to false}} + // expected-note@#bar {{because 'X<False>::value' evaluated to false}} bar<int>(); + // expected-error@-1 {{no matching function for call to 'bar'}} \ // expected-note@-1 {{while checking constraint satisfaction for template 'bar<int>' required here}} \ - // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}} + // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}} \ // expected-note@#bar {{in instantiation of static data member}} - // expected-note@#bar {{in instantiation of requirement here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} // expected-note@#bar {{while substituting template arguments into constraint expression here}} + // expected-note@#bar {{candidate template ignored}} // expected-error@#X_Value {{type 'int' cannot be used prior to '::' because it has no members}} } } diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp index 5199708..5dcb188 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp @@ -39,14 +39,14 @@ using r2i4 = r2<const D>; // expected-error{{constraints not satisfied for class template<typename T> requires requires { sizeof(T); } // expected-note@-1{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} -// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} +// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} struct r3 {}; using r3i1 = r3<int>; using r3i2 = r3<A>; using r3i3 = r3<A &>; using r3i4 = r3<void>; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} +using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} template<typename T> requires requires (T t) { 0; "a"; (void)'a'; } struct r4 {}; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp index 5433cfb..28dff33 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp @@ -182,14 +182,14 @@ namespace std_example { static_assert(C1<has_inner_and_type> && C2<has_inner_and_type> && C3<has_inner_and_type>); template<C1 T> struct C1_check {}; // expected-note@-1 {{because 'int' does not satisfy 'C1'}} - // 
expected-note@-2 {{because 'std_example::has_type' does not satisfy 'C1'}} + // expected-note@-2 {{because 'has_type' does not satisfy 'C1'}} template<C2 T> struct C2_check {}; - // expected-note@-1 {{because 'std_example::has_inner' does not satisfy 'C2'}} + // expected-note@-1 {{because 'has_inner' does not satisfy 'C2'}} template<C3 T> struct C3_check {}; // expected-note@-1 {{because 'void' does not satisfy 'C3'}} using c1 = C1_check<int>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = int]}} - using c2 = C1_check<has_type>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = std_example::has_type]}} - using c3 = C2_check<has_inner>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::has_inner]}} + using c2 = C1_check<has_type>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = has_type]}} + using c3 = C2_check<has_inner>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = has_inner]}} using c4 = C3_check<void>; // expected-error{{constraints not satisfied for class template 'C3_check' [with T = void]}} } @@ -199,10 +199,10 @@ template <typename T> concept C = requires { requires requires { T::a; }; }; // expected-note@-1 {{because 'T::a' would be invalid: no member named 'a' in 'PR48656::T1'}} template <C...> struct A {}; -// expected-note@-1 {{because 'PR48656::T1' does not satisfy 'C'}} +// expected-note@-1 {{because 'T1' does not satisfy 'C'}} struct T1 {}; -template struct A<T1>; // expected-error {{constraints not satisfied for class template 'A' [with $0 = <PR48656::T1>]}} +template struct A<T1>; // expected-error {{constraints not satisfied for class template 'A' [with $0 = <T1>]}} struct T2 { static constexpr bool a = false; }; template struct A<T2>; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp index 59e6a48..6dea0c6 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp @@ -28,9 +28,8 @@ template<typename T> requires requires { requires S<T>{}; // expected-error@-1{{atomic constraint must be of type 'bool' (found 'S<int>')}} // expected-note@-2{{while checking the satisfaction}} - // expected-note@-3{{in instantiation of requirement}} - // expected-note@-4{{while checking the satisfaction}} - // expected-note@-6{{while substituting template arguments}} + // expected-note@-3{{while checking the satisfaction of nested requirement}} + // expected-note@-5{{while substituting template arguments}} // expected-note@#F3INST{{while checking constraint satisfaction}} // expected-note@#F3INST{{while substituting deduced template arguments into function template 'f3' [with T = int]}} // diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp index 3992835..34c5c5d 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp @@ -1,21 +1,31 @@ // RUN: %clang_cc1 -std=c++2a -x c++ -verify %s +// RUN: %clang_cc1 -std=c++2c -x c++ -verify %s template<typename T> concept True = true; -template<typename T> concept Foo = True<T*>; -template<typename T> concept Bar = Foo<T&>; 
-template<typename T> requires Bar<T> struct S { }; -template<typename T> requires Bar<T> && true struct S<T> { }; +template<typename T> concept Foo = True<T*>; // #Foo +template<typename T> concept Bar = Foo<T&>; // #Bar +template<typename T> requires Bar<T> struct S { }; // #S +template<typename T> requires Bar<T> && true struct S<T> { }; // #SpecS +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo 2{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-note@#SpecS {{while substituting into concept arguments here}} +// expected-note@#S {{while substituting into concept arguments here}} +// expected-note@#Bar 2{{while substituting into concept arguments here}} +// expected-note@#S {{template is declared here}} + + template<typename T> concept True2 = sizeof(T) >= 0; -template<typename T> concept Foo2 = True2<T*>; -// expected-error@-1{{'type name' declared as a pointer to a reference of type 'type-parameter-0-0 &'}} -template<typename T> concept Bar2 = Foo2<T&>; -// expected-note@-1{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}} -template<typename T> requires Bar2<T> struct S2 { }; +template<typename T> concept Foo2 = True2<T*>; // #Foo2 + +template<typename T> concept Bar2 = Foo2<T&>; // #Bar2 +// expected-note@-1 3{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}} +template<typename T> requires Bar2<T> struct S2 { }; // #SpecS2_1 // expected-note@-1{{template is declared here}} -template<typename T> requires Bar2<T> && true struct S2<T> { }; +template<typename T> requires Bar2<T> && true struct S2<T> { }; // #SpecS2_2 // expected-error@-1{{class template partial specialization is not more specialized than the primary template}} -// expected-note@-2{{while calculating associated constraint of template 'S2<T>' here}} +// expected-error@#Foo2{{'type name' declared as a pointer to a reference of type 'T &'}} + namespace type_pack { template<typename... Args> @@ -71,16 +81,31 @@ namespace non_type_pack { namespace PR47174 { // This checks that we don't crash with a failed substitution on the first constrained argument when // performing normalization. -template <Bar2 T, True U> +template <Bar2 T, True U> // #S3_Header requires true struct S3; // expected-note {{template is declared here}} template <True T, True U> -requires true struct S3<T, U>; // expected-error {{class template partial specialization is not more specialized than the primary template}} +requires true struct S3<T, U>; +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo2 2{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-note@#SpecS2_1 {{while substituting into concept arguments here}} +// expected-note@#SpecS2_2 {{while substituting into concept arguments here}} +// expected-note@#S3_Header {{while substituting into concept arguments here}} +// expected-note@#Bar2 {{while substituting into concept arguments here}} + // Same as above, for the second position (but this was already working). 
-template <True T, Bar2 U> -requires true struct S4; // expected-note {{template is declared here}} +template <True T, Bar2 U> // #S4_Header +requires true struct S4; // #S4 template <True T, True U> -requires true struct S4<T, U>; // expected-error {{class template partial specialization is not more specialized than the primary template}} +requires true struct S4<T, U>; // #S4-spec +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo2 {{'type name' declared as a pointer to a reference of type 'U &'}} +// expected-note@#S4_Header {{while substituting into concept arguments here}} +// expected-note@#S4 {{template is declared here}} +// expected-note@#S4 {{similar constraint expressions not considered equivalent}} +// expected-note@#S4-spec {{similar constraint expression here}} + + struct X { template<int> struct Y { @@ -96,7 +121,7 @@ template<class T> requires C1<T> && C2<T> void t1() = delete; // expected-note { template void t1<X>(); void t1() { t1<X>(); } // expected-error {{call to deleted function 't1'}} -template<class T> requires C1<T> void t2() {}; // expected-note 2 {{candidate function}} +template<class T> requires C1<T> void t2() {}; // expected-note 2 {{candidate function}} template<class T> requires C2<T> void t2() {}; // expected-note 2 {{candidate function}} template void t2<X>(); // expected-error {{partial ordering for explicit instantiation of 't2' is ambiguous}} void t2() { t2<X>(); } // expected-error {{call to 't2' is ambiguous}} diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index 4f5fdd3..c0406f8 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -86,16 +86,18 @@ using f1 = F<int>; using f2 = F<long>; // expected-error {{constraints not satisfied for alias template 'F' [with T = long]}} template<typename T, typename... 
Ts> -concept OneOf = (is_same_v<T, Ts> || ...); -// expected-note@-1 2{{because 'is_same_v<char, char[1]>' evaluated to false}} -// expected-note@-2 2{{and 'is_same_v<char, char[2]>' evaluated to false}} -// expected-note@-3 {{because 'is_same_v<short, int>' evaluated to false}} -// expected-note@-4 {{and 'is_same_v<short, long>' evaluated to false}} -// expected-note@-5 {{and 'is_same_v<short, char>' evaluated to false}} -// expected-note@-6 3{{because 'is_same_v<int, char[1]>' evaluated to false}} -// expected-note@-7 3{{and 'is_same_v<int, char[2]>' evaluated to false}} -// expected-note@-8 2{{because 'is_same_v<std::nullptr_t, char>' evaluated to false}} -// expected-note@-9 2{{and 'is_same_v<std::nullptr_t, int>' evaluated to false}} +concept OneOf = (is_same_v<T, Ts> || ...); // #OneOf +// expected-note@#OneOf 2{{because 'is_same_v<char, char[1]>' evaluated to false}} +// expected-note@#OneOf 2{{and 'is_same_v<char, char[2]>' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v<short, int>' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v<short, long>' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v<short, char>' evaluated to false}} +// expected-note@#OneOf 3{{because 'is_same_v<int, char[1]>' evaluated to false}} +// expected-note@#OneOf 3{{and 'is_same_v<int, char[2]>' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v<decltype(nullptr), char>' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v<std::nullptr_t, char>' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v<std::nullptr_t, int>' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v<decltype(nullptr), int>' evaluated to false}} template<OneOf<char[1], char[2]> T, OneOf<int, long, char> U> // expected-note@-1 2{{because 'OneOf<char, char[1], char[2]>' evaluated to false}} @@ -124,6 +126,7 @@ using I = int; using i1 = I<1>; using i2 = I<'a'>; +// FIXME: This crashes with -std=c++2c using i3 = I<nullptr>; // expected-error@-1 {{constraints not satisfied for alias template 'I' [with x = nullptr]}} diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index fd1a5c0..404b928 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -587,6 +587,23 @@ static_assert(__is_same(decltype(a), A<A<int>>)); } // namespace GH133132 +namespace GH131408 { + +struct Node {}; + +template <class T, Node> +struct A { + A(T) {} +}; + +template <class T> +using AA = A<T, {}>; + +AA a{0}; + +static_assert(__is_same(decltype(a), A<int, Node{}>)); +} + namespace GH130604 { template <typename T> struct A { A(T); diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp index 99a82d9..ce86266 100644 --- a/clang/test/SemaCXX/cxx23-assume.cpp +++ b/clang/test/SemaCXX/cxx23-assume.cpp @@ -127,13 +127,12 @@ struct F { template <typename T> constexpr int f5() requires C<T> { return 1; } // expected-note {{while checking the satisfaction}} - // expected-note@-1 {{while substituting template arguments}} - // expected-note@-2 {{candidate template ignored}} + // expected-note@-1 {{candidate template ignored}} template <typename T> -constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}} - // expected-note@-1 4 {{while substituting template arguments}} - // expected-note@-2 {{candidate template ignored}} +constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while 
checking the satisfaction}} \ + // expected-note 4 {{while substituting template arguments}} \ + // expected-note {{candidate template ignored}} static_assert(f5<int>() == 1); static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}} diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 74b3573..6777dc2 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1257,13 +1257,13 @@ void f() { (&A::e)(a, a); // expected-error@-1 {{no matching function for call to 'e'}} \ // expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \ - // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}} (&A::e<A>)(a, 0); (&A::e<A>)(a, a); // expected-error@-1 {{no matching function for call to 'e'}} \ // expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \ - // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}} (&A::e<A, int>)(a, 0); @@ -1273,12 +1273,12 @@ void f() { (&A::f<A>)(a); // expected-error@-1 {{no matching function for call to 'f'}} \ // expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \ - // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}} (&A::f)(a); // expected-error@-1 {{no matching function for call to 'f'}} \ // expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \ - // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}} (&A::g)(a); (&A::g)(a, 0); diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 4220486..137f46e 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -std=c++2c -verify %s -template <class T> concept A = true; -template <class T> concept C = A<T> && true; +template <class T> concept A = (T(), true); +template <class T> concept C = A<T> && true; // #C template <class T> concept D = A<T> && __is_same(T, int); @@ -40,13 +40,23 @@ constexpr int i(T...) { return 1; }; // expected-note {{candidate}} static_assert(i(0) == 1); // expected-error {{call to 'i' is ambiguous}} -template <class... T> requires (A<T> || ... || true) -constexpr int j(T...) { return 0; }; -template <class... T> requires (C<T> && ... && true) -constexpr int j(T...) { return 1; }; +template <class... T> requires (A<T> || ... || true) constexpr int j(T...) { return 0; }; // #j1 +template <class... T> requires (C<T> && ... && true) constexpr int j(T...) 
{ return 1; }; // #j2 static_assert(j(0) == 1); +// expected-error@-1 {{call to 'j' is ambiguous}} +// expected-note@#j1 {{candidate function [with T = <int>]}} +// expected-note@#j2 {{candidate function [with T = <int>]}} +// expected-note@#j2 {{similar constraint expressions not considered equivalent}} +// expected-note@#j1 {{similar constraint expression here}} + + static_assert(j() == 1); +// expected-error@-1 {{call to 'j' is ambiguous}} +// expected-note@#j1 {{candidate function [with T = <>]}} +// expected-note@#j2 {{candidate function [with T = <>]}} +// expected-note@#j2 {{similar constraint expressions not considered equivalent}} +// expected-note@#j1 {{similar constraint expression here}} @@ -107,7 +117,7 @@ void test() { } namespace substitution { - struct S { +struct S { using type = int; }; @@ -144,51 +154,69 @@ consteval int Or3() requires (C<typename T::type> || ... || C<typename U::type>) static_assert(And1<>() == 1); static_assert(And1<S>() == 1); static_assert(And1<S, S>() == 1); +// FIXME: The diagnostics are not so great static_assert(And1<int>() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int>]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And1<S, int>() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <S, int>]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And1<int, S>() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int, S>]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2<S>() == 2); static_assert(And2<S, S>() == 2); -static_assert(And2<int>() == 2); +static_assert(And2<int>() == 2); // expected-error {{no matching function for call to 'And2'}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And2<int, int>() == 2); // expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}} \ + // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2<S, int>() == 2); 
// expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}} + // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2<int, S>() == 2); // expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}} + // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And3<S>() == 3); static_assert(And3<S, S>() == 3); static_assert(And3<int>() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3<int, int>() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3<S, int>() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}} + // expected-note@#and3 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3<int, S>() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}} @@ -198,25 +226,26 @@ static_assert(Or1<int, S>() == 1); static_assert(Or1<S, int>() == 1); static_assert(Or1<S, S>() == 1); static_assert(Or1<int>() == 1); // expected-error {{no matching function for call to 'Or1'}} - // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} \ - // expected-note@#or1 {{because substituted constraint expression is 
ill-formed}} - + // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} + // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or2<S>() == 2); static_assert(Or2<int, S>() == 2); static_assert(Or2<S, int>() == 2); static_assert(Or2<S, S>() == 2); static_assert(Or2<int>() == 2); // expected-error {{no matching function for call to 'Or2'}} - // expected-note@#or2 {{candidate template ignored: constraints not satisfied}} \ - // expected-note@#or2 {{because substituted constraint expression is ill-formed}} - + // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or3<S>() == 3); static_assert(Or3<int, S>() == 3); static_assert(Or3<S, int>() == 3); static_assert(Or3<S, S>() == 3); static_assert(Or3<int>() == 3); // expected-error {{no matching function for call to 'Or3'}} - // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} \ - // expected-note@#or3 {{because substituted constraint expression is ill-formed}} + // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} } namespace bool_conversion_break { @@ -226,7 +255,7 @@ struct Thingy { static constexpr int compare(const Thingy&) {return 1;} }; template <typename ...T, typename ...U> -void f(A<T ...> *, A<U ...> *) // expected-note {{candidate template ignored: failed template argument deduction}} +void f(A<T ...> *, A<U ...> *) // expected-note {{candidate template ignored: constraints not satisfied}} requires (T::compare(U{}) && ...); // expected-error {{atomic constraint must be of type 'bool' (found 'int')}} void g() { @@ -269,9 +298,7 @@ struct S { static_assert(S<int>::f<int>() == 2); -static_assert(S<int>::g<int>() == 2); // expected-error {{call to 'g' is ambiguous}} - // expected-note@#nested-ambiguous-g1 {{candidate}} - // expected-note@#nested-ambiguous-g2 {{candidate}} +static_assert(S<int>::g<int>() == 2); } @@ -384,3 +411,98 @@ struct LazyLitMatrix<index_by<Indices...>, init> { } } + +namespace GH135190 { +template <typename T> +concept A = __is_same_as(T, int) || __is_same_as(T, double) ; + +template <typename T> +concept B = A<T> && __is_same_as(T, double); + +template <class... Ts> +requires(A<Ts> && ...) +constexpr int g() { + return 1; +} + +template <class... Ts> +requires(B<Ts> && ...) +constexpr int g() { + return 2; +} + +static_assert(g<double>() == 2); + + +template <class... Ts> +concept all_A = (A<Ts> && ...); + +template <class... Ts> +concept all_B = (B<Ts> && ...); + +template <class... Ts> +requires all_A<Ts...> +constexpr int h() { + return 1; +} + +template <class... Ts> +requires all_B<Ts...> +constexpr int h() { + return 2; +} + +static_assert(h<double>() == 2); +} + + +namespace parameter_mapping_regressions { + +namespace case1 { +namespace std { +template <class _Tp, class... _Args> +constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); +template <class _Tp, class... 
_Args> +concept constructible_from = is_constructible_v<_Tp, _Args...>; +template <class _Tp> +concept default_initializable = true; +template <class> using iterator_t = int; +template <class _Tp> +concept view = constructible_from<_Tp, _Tp>; +template <class... _Views> + requires(view<_Views> && ...) +class zip_transform_view; +} // namespace std +struct IterDefaultCtrView {}; +template <class... Views> +using Iter = std::iterator_t<std::zip_transform_view<Views...>>; +static_assert( + std::default_initializable<Iter<IterDefaultCtrView, IterDefaultCtrView>>); + +} + +namespace case2 { + +template <class _Bp> +constexpr bool False = false; + +template <class... _Views> +concept __zip_all_random_access = (False<_Views> && ...); +// expected-note@-1 {{evaluated to false}} + +template <typename... _Views> +struct zip_view { + void f() requires __zip_all_random_access<_Views...>{}; + // expected-note@-1 {{because 'int' does not satisfy}} +}; + +zip_view<int> test_v; +static_assert(!__zip_all_random_access<int>); + +void test() { + test_v.f(); // expected-error {{invalid reference to function 'f'}} +} + +} + +} diff --git a/clang/test/SemaCXX/cxx2c-template-template-param.cpp b/clang/test/SemaCXX/cxx2c-template-template-param.cpp index ed55a059..4ad3fd9 100644 --- a/clang/test/SemaCXX/cxx2c-template-template-param.cpp +++ b/clang/test/SemaCXX/cxx2c-template-template-param.cpp @@ -106,7 +106,7 @@ concept BinaryDefaultedFalse = false; template <template <typename...> concept C, typename T> struct S { - template <C TT> // expected-note {{because 'int' does not satisfy 'UnaryFalse'}} + template <C TT> // expected-note 2{{because 'int' does not satisfy 'UnaryFalse'}} void f(TT); // expected-note {{ignored}} void g(C auto); // expected-note {{ignored}} \ // expected-note {{because 'int' does not satisfy 'UnaryFalse'}} @@ -171,7 +171,7 @@ concept BinaryDefaultedFalse = false; template <template <typename...> concept C, typename T> struct S { - template <C TT> // expected-note {{because 'int' does not satisfy 'UnaryFalse'}} + template <C TT> // expected-note 2{{because 'int' does not satisfy 'UnaryFalse'}} void f(TT); // expected-note {{ignored}} void g(C auto); // expected-note {{ignored}} \ // expected-note {{because 'int' does not satisfy 'UnaryFalse'}} diff --git a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp index 436dfb9..8400340 100644 --- a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp +++ b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp @@ -1,6 +1,6 @@ // RUN: %clang -fsyntax-only -std=c++2a -Xclang -verify -ftemplate-depth=5 -ftemplate-backtrace-limit=4 %s -// RequiresExpr contains invalid requirement. (Eg. Highly recurisive template). +// RequiresExpr contains invalid requirement. (Eg. Highly recursive template). 
template<int x> struct A { static constexpr bool far(); }; class B { @@ -19,7 +19,7 @@ constexpr bool A<x>::far() { // expected-error@#Invalid {{recursive template instantiation exceeded maximum depth}} // expected-note@#Invalid 3 {{while}} // expected-note@#Invalid {{contexts in backtrace}} - // expected-note@#Invalid {{increase recursive template instantiation depth}} + // expected-note@#Invalid {{use -ftemplate-depth=N to increase}} }; } static_assert(A<1>::far()); diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp index 135865c..c3bda39 100644 --- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp +++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp @@ -102,7 +102,7 @@ static_assert(__is_constructible(Movable, int)); // expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \ // expected-note@-1 2{{}} // expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}} -// expected-note@#err-self-constraint-1 4{{}} +// expected-note@#err-self-constraint-1 3{{}} // expected-note@#Movable {{'Movable' defined here}} template <typename T> @@ -200,7 +200,6 @@ void h(short n) { f(n); } // expected-note@-1{{while checking constraint satisfaction for template}} // expected-note@#GH62096-note1{{in instantiation}} // expected-note@#GH62096-note1{{while substituting template arguments into constraint expression here}} -// expected-note@#GH62096-note2{{while substituting template arguments into constraint expression here}} // expected-note@#GH62096-note2{{while checking the satisfaction of concept}} // expected-note@#GH62096-err {{expression evaluates}} } diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index d49330f..901d510 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -5129,12 +5129,12 @@ namespace GH121278 { #if __cplusplus >= 202002L template <typename B, typename D> concept C = __is_base_of(B, D); -// expected-error@-1 {{incomplete type 'GH121278::S' used in type trait expression}} +// expected-error@-1 {{incomplete type 'S' used in type trait expression}} // expected-note@-2 {{while substituting template arguments into constraint expression here}} struct T; struct S; bool b = C<T, S>; -// expected-note@-1 {{while checking the satisfaction of concept 'C<GH121278::T, GH121278::S>' requested here}} +// expected-note@-1 {{while checking the satisfaction of concept 'C<T, S>' requested here}} #endif } diff --git a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl index d7c6876..999372c 100644 --- a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl @@ -19,7 +19,7 @@ Buffer<double2> r4; // expected-error@+4 {{constraints not satisfied for class template 'Buffer'}} // expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class Buffer}} -// expected-note@*:* {{because 'hlsl::Buffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'Buffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::Buffer<int>)' evaluated to false}} Buffer<Buffer<int> > r5; @@ -65,7 +65,7 @@ Buffer<half[4]> r10; typedef vector<int, 
8> int8; // expected-error@+3 {{constraints not satisfied for class template 'Buffer'}} -// expected-note@*:* {{because 'vector<int, 8>' (vector of 8 'int' values) does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}} Buffer<int8> r11; @@ -90,7 +90,7 @@ enum numbers { one, two, three }; Buffer<numbers> r15; // expected-error@+3 {{constraints not satisfied for class template 'Buffer'}} -// expected-note@*:* {{because 'vector<double, 3>' (vector of 3 'double' values) does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}} Buffer<double3> r16; diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl index 361f4303..b33f2af 100644 --- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl @@ -19,7 +19,7 @@ RWBuffer<double2> r4; // expected-error@+4 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class RWBuffer}} -// expected-note@*:* {{because 'hlsl::RWBuffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'RWBuffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::RWBuffer<int>)' evaluated to false}} RWBuffer<RWBuffer<int> > r5; @@ -65,7 +65,7 @@ RWBuffer<half[4]> r10; typedef vector<int, 8> int8; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} -// expected-note@*:* {{because 'vector<int, 8>' (vector of 8 'int' values) does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}} RWBuffer<int8> r11; @@ -90,7 +90,7 @@ enum numbers { one, two, three }; RWBuffer<numbers> r15; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} -// expected-note@*:* {{because 'vector<double, 3>' (vector of 3 'double' values) does not satisfy '__is_typed_resource_element_compatible'}} +// expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}} // expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}} RWBuffer<double3> r16; diff --git a/clang/test/SemaTemplate/GH161657.cpp b/clang/test/SemaTemplate/GH161657.cpp index 6ec7931..5ad4dde 100644 --- a/clang/test/SemaTemplate/GH161657.cpp +++ b/clang/test/SemaTemplate/GH161657.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++20 -ffp-exception-behavior=strict -verify %s +// RUN: %clang_cc1 -triple=x86_64 -fsyntax-only -std=c++20 -ffp-exception-behavior=strict -verify %s // expected-no-diagnostics template <class T> struct S { 
diff --git a/clang/test/SemaTemplate/concepts-recovery-expr.cpp b/clang/test/SemaTemplate/concepts-recovery-expr.cpp index 6bed179..aa4ed53 100644 --- a/clang/test/SemaTemplate/concepts-recovery-expr.cpp +++ b/clang/test/SemaTemplate/concepts-recovery-expr.cpp @@ -4,7 +4,7 @@ constexpr bool CausesRecoveryExpr = "test" + 1.0f; template<typename T> -concept ReferencesCRE = CausesRecoveryExpr; +concept ReferencesCRE = CausesRecoveryExpr; // #subst1 template<typename T> requires CausesRecoveryExpr // #NVC1REQ void NoViableCands1(){} // #NVC1 @@ -19,16 +19,18 @@ void NVCUse() { NoViableCands1<int>(); // expected-error@-1 {{no matching function for call to 'NoViableCands1'}} // expected-note@#NVC1{{candidate template ignored: constraints not satisfied}} + // expected-note@#NVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}} // expected-note@#NVC1REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} NoViableCands2<int>(); // expected-error@-1 {{no matching function for call to 'NoViableCands2'}} // expected-note@#NVC2{{candidate template ignored: constraints not satisfied}} - // expected-note@#NVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} NoViableCands3<int>(); // expected-error@-1 {{no matching function for call to 'NoViableCands3'}} // expected-note@#NVC3{{candidate template ignored: constraints not satisfied}} - // expected-note@#NVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#NVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } template<typename T> requires CausesRecoveryExpr // #OVC1REQ @@ -58,12 +60,14 @@ void OVCUse() { // expected-error@-1 {{no matching function for call to 'OtherViableCands2'}} // expected-note@#OVC2_ALT {{candidate function}} // expected-note@#OVC2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#OVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#OVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} OtherViableCands3<int>(); // expected-error@-1 {{no matching function for call to 'OtherViableCands3'}} // expected-note@#OVC3_ALT {{candidate function}} // expected-note@#OVC3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#OVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#OVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } template<typename T> requires CausesRecoveryExpr // #OBNVC1REQ @@ -95,13 +99,15 @@ void OBNVCUse() { // expected-note@#OBNVC2_ALT {{candidate template ignored: constraints not satisfied}} // expected-note@#OBNVC2REQ_ALT {{because 'false' evaluated to false}} // expected-note@#OBNVC2 
{{candidate template ignored: constraints not satisfied}} - // expected-note@#OBNVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#OBNVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} OtherBadNoViableCands3<int>(); // expected-error@-1 {{no matching function for call to 'OtherBadNoViableCands3'}} // expected-note@#OBNVC3_ALT {{candidate template ignored: constraints not satisfied}} // expected-note@#OBNVC3REQ_ALT {{because 'false' evaluated to false}} // expected-note@#OBNVC3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#OBNVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#OBNVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } @@ -136,12 +142,14 @@ void MemOVCUse() { // expected-error@-1 {{no matching member function for call to 'OtherViableCands2'}} // expected-note@#MEMOVC2_ALT {{candidate function}} // expected-note@#MEMOVC2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#MEMOVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#MEMOVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} S.OtherViableCands3<int>(); // expected-error@-1 {{no matching member function for call to 'OtherViableCands3'}} // expected-note@#MEMOVC3_ALT {{candidate function}} // expected-note@#MEMOVC3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#MEMOVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#MEMOVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } struct StaticOVC { @@ -173,12 +181,14 @@ void StaticMemOVCUse() { // expected-error@-1 {{no matching function for call to 'OtherViableCands2'}} // expected-note@#SMEMOVC2_ALT {{candidate function}} // expected-note@#SMEMOVC2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#SMEMOVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#SMEMOVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} StaticOVC::OtherViableCands3<int>(); // expected-error@-1 {{no matching function for call to 'OtherViableCands3'}} // expected-note@#SMEMOVC3_ALT {{candidate function}} // expected-note@#SMEMOVC3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#SMEMOVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#SMEMOVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}} + // 
expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } namespace GH58548 { diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 097cad1..73dce93 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -12,7 +12,7 @@ void g() { // expected-note@#FDEF{{because 'int' does not satisfy 'c'}} // expected-note@#CDEF{{because 'f(t)' would be invalid: no matching function for call to 'f'}} } -} // namespace GH53213 +} // namespace GH53213 namespace GH45736 { struct constrained; @@ -67,15 +67,14 @@ struct my_range{ void baz() { auto it = begin(rng); // #BEGIN_CALL -// expected-error@#INF_BEGIN {{satisfaction of constraint 'Inf<Inf auto>' depends on itself}} -// expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} +// expected-error-re@#INF_REQ {{satisfaction of constraint {{.*}} depends on itself}} +// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}} // expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}} // expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}} // expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}} // expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}} -// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}} -// expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} -// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}} +// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<struct my_range>' requested here}} +// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<struct my_range>' required here}} // expected-note@#BEGIN_CALL {{while substituting deduced template arguments into function template}} // Fallout of the failure is failed lookup, which is necessary to stop odd @@ -83,6 +82,7 @@ auto it = begin(rng); // #BEGIN_CALL // expected-error@#BEGIN_CALL {{no matching function for call to 'begin'}} // expected-note@#NOTINF_BEGIN {{candidate function}} // expected-note@#INF_BEGIN{{candidate template ignored: constraints not satisfied}} +// expected-note@#INF_BEGIN{{because 'Inf auto' does not satisfy 'Inf}} } } // namespace DirectRecursiveCheck @@ -100,16 +100,17 @@ namespace GH50891 { static_assert(Numeric<Deferred>); // #STATIC_ASSERT // expected-error@#NUMERIC{{satisfaction of constraint 'requires (T a) { foo(a); }' depends on itself}} // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} - // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric<GH50891::Deferred>' requested here}} - // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}} - // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}} - // expected-note@#FOO_CALL {{while substituting deduced template arguments into function template}} - // expected-note@#FOO_CALL {{in instantiation of 
requirement here}} + // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric<Deferred>' requested here}} + // expected-note@#OP_TO {{skipping 1 context}} + // expected-note@#FOO_CALL 2{{while checking constraint satisfaction for template}} + // expected-note@#FOO_CALL 2{{while substituting deduced template arguments into function template}} + // expected-note@#FOO_CALL 2{{in instantiation of requirement here}} // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} // expected-error@#STATIC_ASSERT {{static assertion failed}} - // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric<GH50891::Deferred>' requested here}} - // expected-note@#STATIC_ASSERT{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric<Deferred>' requested here}} + // expected-note@#STATIC_ASSERT{{because 'Deferred' does not satisfy 'Numeric'}} + // expected-note@#FOO_CALL{{because 'foo(a)' would be invalid}} } // namespace GH50891 diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 209e7dc..6d29f8b 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1002,7 +1002,7 @@ template<class> concept Irrelevant = false; template <typename T> -concept ErrorRequires = requires(ErrorRequires auto x) { x; }; +concept ErrorRequires = requires(ErrorRequires auto x) { x; }; //#GH54678-ill-formed-concept // expected-error@-1 {{a concept definition cannot refer to itself}} \ // expected-error@-1 {{'auto' not allowed in requires expression parameter}} \ // expected-note@-1 {{declared here}} @@ -1023,8 +1023,7 @@ template<class T> void eee(T t) // expected-note {{candidate template ignored: c requires (Irrelevant<T> || Irrelevant<T> || True<T>) && False<T> {} // expected-note {{'long' does not satisfy 'False'}} template<class T> void fff(T t) // expected-note {{candidate template ignored: constraints not satisfied}} -requires((ErrorRequires<T> || False<T> || True<T>) && False<T>) {} // expected-note {{'unsigned long' does not satisfy 'False'}} - +requires((ErrorRequires<T> || False<T> || True<T>) && False<T>) {} // expected-note {{because 'unsigned long' does not satisfy 'False'}} void test() { aaa(42); // expected-error {{no matching function}} bbb(42L); // expected-error{{no matching function}} @@ -1264,12 +1263,7 @@ C auto x = 0; // expected-error@#T_Type {{type 'int' cannot be used prior to '::'}} \ // expected-note@-1 {{in instantiation of default argument}} -// This will be fixed when we merge https://github.com/llvm/llvm-project/pull/141776 -// Which makes us behave like GCC. 
static_assert(f(0)); -// expected-error@-1 {{no matching function for call}} \ -// expected-note@#GH61824_f {{constraints not satisfied}} \ -// expected-note@#T_Type {{type 'int' cannot be used prior to '::'}} } @@ -1278,4 +1272,65 @@ template <typename T> concept PerfectSquare = [](){} // expected-note 2{{here}} ([](auto) { return true; }) < PerfectSquare <class T>; // expected-error@-1 {{declaration of 'T' shadows template parameter}} \ // expected-error@-1 {{a concept definition cannot refer to itself}} + +} +namespace GH61811{ +template <class T> struct A { static const int x = 42; }; +template <class Ta> concept A42 = A<Ta>::x == 42; +template <class Tv> concept Void = __is_same_as(Tv, void); +template <class Tb, class Ub> concept A42b = Void<Tb> || A42<Ub>; +template <class Tc> concept R42c = A42b<Tc, Tc&>; +static_assert (R42c<void>); +} + +namespace parameter_mapping_regressions { + +namespace case1 { + +template <template <class> class> using __meval = struct __q; +template <template <class> class _Tp> +concept __mvalid = requires { typename __meval<_Tp>; }; +template <class _Fn> +concept __minvocable = __mvalid<_Fn::template __f>; +template <class...> struct __mdefer_; +template <class _Fn, class... _Args> + requires __minvocable<_Fn> +struct __mdefer_<_Fn, _Args...> {}; +template <class = __q> struct __mtransform { + template <class> using __f = int; +}; +struct __completion_domain_or_none_ : __mdefer_<__mtransform<>> {}; + +} + +namespace case2 { + +template<auto& Q, class P> concept C = Q.template operator()<P>(); +template<class P> concept E = C<[]<class Ty>{ return false; }, P>; +static_assert(!E<int>); + +} + + +namespace case3 { +template <class> constexpr bool is_move_constructible_v = false; + +template <class _Tp> +concept __cpp17_move_constructible = is_move_constructible_v<_Tp>; // #is_move_constructible_v + +template <class _Tp> +concept __cpp17_copy_constructible = __cpp17_move_constructible<_Tp>; // #__cpp17_move_constructible + +template <class _Iter> +concept __cpp17_iterator = __cpp17_copy_constructible<_Iter>; // #__cpp17_copy_constructible + +struct not_move_constructible {}; +static_assert(__cpp17_iterator<not_move_constructible>); \ +// expected-error {{static assertion failed}} \ +// expected-note {{because 'not_move_constructible' does not satisfy '__cpp17_iterator'}} \ +// expected-note@#__cpp17_copy_constructible {{because 'not_move_constructible' does not satisfy '__cpp17_copy_constructible'}} \ +// expected-note@#__cpp17_move_constructible {{because 'parameter_mapping_regressions::case3::not_move_constructible' does not satisfy '__cpp17_move_constructible'}} \ +// expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}} +} + } diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index e2b586e..9e5756f 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -574,8 +574,9 @@ static_assert(x.size == 4); // CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} <col:18, col:24> col:21 'U (&)[3]' // CHECK-NEXT: | `-ConceptSpecializationExpr 0x{{.+}} <col:36, col:42> 'bool' Concept 0x{{.+}} 'True' // CHECK-NEXT: | |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 -// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-0' -// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// CHECK-NEXT: | | `-TemplateArgument 
type 'T' +// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 +// CHECK-NEXT: | | `-TemplateTypeParm 0x{{.+}} 'T' // CHECK-NEXT: | `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' // CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 // CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'T' @@ -588,8 +589,9 @@ static_assert(x.size == 4); // CHECK-NEXT: |-ParmVarDecl 0x{{.+}} <col:18, col:24> col:21 'double (&)[3]' // CHECK-NEXT: `-ConceptSpecializationExpr 0x{{.+}} <col:36, col:42> 'bool' Concept 0x{{.+}} 'True' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28 -// CHECK-NEXT: | `-TemplateArgument type 'type-parameter-0-0' -// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0 +// CHECK-NEXT: | `-TemplateArgument type 'T' +// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 +// CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'T' // CHECK-NEXT: `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0' // CHECK-NEXT: `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0 // CHECK-NEXT: `-TemplateTypeParm 0x{{.+}} 'T' @@ -660,8 +662,9 @@ Test test(42); // CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'Constraint' depth 0 index 1 auto:1 // CHECK-NEXT: | `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'Constraint' // CHECK-NEXT: | |-ImplicitConceptSpecializationDecl {{.*}} -// CHECK-NEXT: | | |-TemplateArgument type 'type-parameter-0-1' -// CHECK-NEXT: | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1 +// CHECK-NEXT: | | |-TemplateArgument type 'auto:1' +// CHECK-NEXT: | | | `-TemplateTypeParmType {{.*}} 'auto:1' dependent depth 0 index 1 +// CHECK-NEXT: | | | `-TemplateTypeParm {{.*}} 'auto:1' // CHECK-NEXT: | | `-TemplateArgument type 'int' // CHECK-NEXT: | | `-BuiltinType {{.*}} 'int' // CHECK-NEXT: | |-TemplateArgument {{.*}} type 'auto:1':'type-parameter-0-1' diff --git a/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp b/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp index 1f2171a..e03756e 100644 --- a/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp +++ b/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -std=c++2a -x c++ %s -verify + template<typename...> concept C = false; // expected-note 9{{because}} diff --git a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp index 3edf243..de4a484 100644 --- a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp +++ b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp @@ -7,8 +7,7 @@ template<typename T> constexpr bool is_same_v<T, T> = true; template<typename T, typename U> -concept same_as = is_same_v<T, U>; -// expected-note@-1{{because 'is_same_v<int, bool>' evaluated to false}} +concept same_as = is_same_v<T, U>; //#is_same_v template<typename T, typename... Us> concept either = (is_same_v<T, Us> || ...); @@ -17,6 +16,7 @@ template<typename... Ts> struct T { template<same_as<Ts>... Us> // expected-note@-1{{because 'same_as<int, bool>' evaluated to false}} + // expected-note@#is_same_v{{because 'is_same_v<int, bool>' evaluated to false}} static void foo(Us... 
u, int x) { }; // expected-note@-1{{candidate template ignored: deduced too few arguments}} // expected-note@-2{{candidate template ignored: constraints not satisfied}} diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index e60f792..32ad537 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -72,12 +72,12 @@ namespace type_requirement { template<typename T> requires false_v<requires { typename T::template temp<T>; }> - // expected-note@-1 {{because 'false_v<requires { typename type_requirement::contains_template<int>::template temp<type_requirement::contains_template<int>>; }>' evaluated to false}} - // expected-note@-2 {{because 'false_v<requires { typename type_requirement::contains_template<short>::template temp<type_requirement::contains_template<short>>; }>' evaluated to false}} + // expected-note@-1 {{because 'false_v<requires { typename contains_template<int>::template temp<contains_template<int>>; }>' evaluated to false}} + // expected-note@-2 {{because 'false_v<requires { typename contains_template<short>::template temp<contains_template<short>>; }>' evaluated to false}} struct r2 {}; - using r2i1 = r2<contains_template<int>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template<int>]}} - using r2i2 = r2<contains_template<short>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template<short>]}} + using r2i1 = r2<contains_template<int>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template<int>]}} + using r2i2 = r2<contains_template<short>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template<short>]}} // substitution error occurs, then requires expr is instantiated again @@ -108,7 +108,7 @@ namespace type_requirement { // expected-note@-1 {{because 'false_v<requires { <<error-type>>; } && requires { <<error-type>>; }>' evaluated to false}} struct r7 {}; - using r7i = r7<int, A>; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = <int, type_requirement::A>]}} + using r7i = r7<int, A>; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = <int, A>]}} } namespace expr_requirement { @@ -268,3 +268,13 @@ struct Foo { }; } // namespace GH110785 + +namespace sugared_instantiation { + template <class C1> concept C = requires { C1{}; }; + template <class D1> concept D = requires { new D1; }; + + // Test that 'deduced auto' doesn't get confused with 'undeduced auto'. 
+ auto f() { return 0; } + static_assert(requires { { f() } -> C; }); + static_assert(requires { { f() } -> D; }); +} // namespace sugared_instantiation diff --git a/clang/test/SemaTemplate/instantiate-template-argument.cpp b/clang/test/SemaTemplate/instantiate-template-argument.cpp index 43d5d00..7606619 100644 --- a/clang/test/SemaTemplate/instantiate-template-argument.cpp +++ b/clang/test/SemaTemplate/instantiate-template-argument.cpp @@ -1,4 +1,6 @@ -// RUN: %clang_cc1 -std=c++2a -x c++ %s -verify +// RUN: %clang_cc1 -std=c++2a -x c++ %s -verify=expected,cxx20 +// RUN: %clang_cc1 -std=c++2c -x c++ %s -verify + template<auto T, decltype(T) U> concept C1 = sizeof(U) >= 4; @@ -9,20 +11,101 @@ concept C2 = C1<Y{}, V>; // sizeof(U) >= 4 [U = V (decltype(Y{}))] template<char W> -constexpr int foo() requires C2<int, W> { return 1; } +constexpr int foo() requires C2<int, W> { return 1; } // #cand1 // sizeof(U) >= 4 [U = W (decltype(int{}))] template<char X> -// expected-note@+1{{candidate function}} -constexpr int foo() requires C1<1, X> && true { return 2; } +constexpr int foo() requires C1<1, X> && true { return 2; } // #cand2 // sizeof(U) >= 4 [U = X (decltype(1))] static_assert(foo<'a'>() == 2); + template<char Z> -// expected-note@+1{{candidate function}} -constexpr int foo() requires C2<long long, Z> && true { return 3; } +constexpr int foo() requires C2<long long, Z> && true { return 3; } // #cand3 // sizeof(U) >= 4 [U = Z (decltype(long long{}))] static_assert(foo<'a'>() == 3); -// expected-error@-1{{call to 'foo' is ambiguous}}
\ No newline at end of file +// expected-error@-1{{call to 'foo' is ambiguous}} +// expected-note@#cand2 {{candidate function}} +// expected-note@#cand3 {{candidate function}} + + +namespace case1 { + +template<auto T, decltype(T) U> +concept C1 = sizeof(T) >= 4; // #case1_C1 + +template<typename Y, char V> +concept C2 = C1<Y{}, V>; // #case1_C2 + +template<class T, char W> +constexpr int foo() requires C2<T, W> { return 1; } // #case1_foo1 + +template<class T, char X> +constexpr int foo() requires C1<T{}, X> && true { return 2; } // #case1_foo2 + +static_assert(foo<char, 'a'>() == 2); +// expected-error@-1{{no matching function for call to 'foo'}} +// expected-note@#case1_foo1{{candidate template ignored: constraints not satisfied [with T = char, W = 'a']}} +// expected-note@#case1_foo1{{because 'C2<char, 'a'>' evaluated to false}} +// expected-note@#case1_C2{{because 'C1<char{}, 'a'>' evaluated to false}} +// expected-note@#case1_C1{{because 'sizeof ('\x00') >= 4' (1 >= 4) evaluated to false}} +// expected-note@#case1_foo2{{candidate template ignored: constraints not satisfied [with T = char, X = 'a']}} +// expected-note@#case1_foo2{{because 'C1<char{}, 'a'>' evaluated to false}} +// expected-note@#case1_C1{{because 'sizeof ('\x00') >= 4' (1 >= 4) evaluated to false}} + +static_assert(foo<int, 'a'>() == 2); + +} + +namespace packs { + +template<auto T, decltype(T) U> +concept C1 = sizeof(U) >= 4; + +template<typename Y, char V> +concept C2 = C1<Y{}, V>; + +template<char... W> +constexpr int foo() requires (C2<int, W> && ...) { return 1; } // #packs-cand1 + +template<char... X> +constexpr int foo() requires (C1<1, X> && ...) && true { return 2; } // #packs-cand2 + +static_assert(foo<'a'>() == 2); +// cxx20-error@-1{{call to 'foo' is ambiguous}} +// cxx20-note@#packs-cand1 {{candidate function}} +// cxx20-note@#packs-cand2 {{candidate function}} + +} + +namespace case2 { +template<auto T> concept C1 = sizeof(decltype(T)) >= 0; +template<typename Y> concept C2 = C1<Y{}>; + +template<char W> +constexpr int foo() requires C2<int> { return 1; } + +template<char X> +constexpr int foo() requires C1<0> && true { return 2; } + +static_assert(foo<0>() == 2); +} + +namespace case3 { +template<auto T> concept C1 = sizeof(decltype(T)) >= 0; + +template<typename Y> concept C2 = C1<Y{}>; + +template<char W> +constexpr int foo() requires C2<int> { return 1; } // #case3_foo1 + +template<char X> +constexpr int foo() requires C1<1> && true { return 2; } // #case3_foo2 + +static_assert(foo<0>() == 2); +// expected-error@-1{{call to 'foo' is ambiguous}} +// expected-note@#case3_foo1 {{candidate function}} +// expected-note@#case3_foo2 {{candidate function}} +} diff --git a/clang/test/SemaTemplate/pr52970.cpp b/clang/test/SemaTemplate/pr52970.cpp index 7aac5ee..6aabc41 100644 --- a/clang/test/SemaTemplate/pr52970.cpp +++ b/clang/test/SemaTemplate/pr52970.cpp @@ -53,7 +53,7 @@ static_assert(!DotFollowingPointer::f(Bad{}), ""); #if __cplusplus >= 202002L template <class T> concept C = requires(T t) { t.begin(); }; - // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Holder<Incomplete> *' is a pointer}} + // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Bad' (aka 'Holder<Incomplete> *') is a pointer}} static_assert(C<Good>); static_assert(!C<Bad>); diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp index 98a225d..0f9f419 100644 --- a/flang-rt/lib/runtime/character.cpp +++ b/flang-rt/lib/runtime/character.cpp @@ -789,7 
+789,7 @@ void RTDEF(LenTrim)(Descriptor &result, const Descriptor &string, int kind, std::size_t RTDEF(Scan1)(const char *x, std::size_t xLen, const char *set, std::size_t setLen, bool back) { - return ScanVerify<char, CharFunc::Scan>(x, xLen, set, setLen, back); + return ScanVerify<false>(x, xLen, set, setLen, back); } std::size_t RTDEF(Scan2)(const char16_t *x, std::size_t xLen, const char16_t *set, std::size_t setLen, bool back) { @@ -873,7 +873,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string, std::size_t RTDEF(Verify1)(const char *x, std::size_t xLen, const char *set, std::size_t setLen, bool back) { - return ScanVerify<char, CharFunc::Verify>(x, xLen, set, setLen, back); + return ScanVerify<true>(x, xLen, set, setLen, back); } std::size_t RTDEF(Verify2)(const char16_t *x, std::size_t xLen, const char16_t *set, std::size_t setLen, bool back) { diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index bdf7e4a..e006d2e 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -285,11 +285,16 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr( if (auto iter{moduleMap.find(name)}; iter != moduleMap.end()) { modAttr = iter->getValue(); } else { + // When decl is true, it means that the module is only being used in this + // compilation unit and is defined elsewhere. But if the file/line/scope + // fields are valid, the module is not merged with its definition and is + // considered different. So we only set those fields when decl is false. modAttr = mlir::LLVM::DIModuleAttr::get( - context, fileAttr, scope, mlir::StringAttr::get(context, name), + context, decl ? nullptr : fileAttr, decl ? nullptr : scope, + mlir::StringAttr::get(context, name), /* configMacros */ mlir::StringAttr(), /* includePath */ mlir::StringAttr(), - /* apinotes */ mlir::StringAttr(), line, decl); + /* apinotes */ mlir::StringAttr(), decl ?
0 : line, decl); moduleMap[name] = modAttr; } return modAttr; diff --git a/flang/test/Transforms/debug-module-3.fir b/flang/test/Transforms/debug-module-3.fir new file mode 100644 index 0000000..03cc21e --- /dev/null +++ b/flang/test/Transforms/debug-module-3.fir @@ -0,0 +1,13 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module { + func.func @_QQmain() { + %2 = fir.address_of(@_QMmodEvar1) : !fir.ref<i32> loc(#loc1) + %3 = fircg.ext_declare %2 {uniq_name = "_QMmodEvar1"} : (!fir.ref<i32>) -> !fir.ref<i32> loc(#loc1) + return + } loc(#loc1) + fir.global @_QMmodEvar1 : i32 loc(#loc1) +} +#loc1 = loc("test1.f90":1:0) + +// CHECK: #llvm.di_module<name = "mod", isDecl = true> diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp index 629a887..70341ee 100644 --- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp +++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp @@ -143,7 +143,7 @@ void check_forward_iterator_requirements() { // expected-note@*:* {{because 'not_default_constructible' does not satisfy '__cpp17_default_constructible'}} _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref, ""); // expected-error {{static assertion failed}} #ifndef _AIX - // expected-note-re@*:* {{because type constraint 'convertible_to<{{(valid_iterator<postincrement_not_ref>::)?}}Proxy, const postincrement_not_ref &>' was not satisfied}} + // expected-note-re@*:* {{'convertible_to<{{(valid_iterator<postincrement_not_ref>::)?}}Proxy, const postincrement_not_ref &>'}} #endif } @@ -173,7 +173,7 @@ void check_bidirectional_iterator_requirements() { _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(missing_postdecrement, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{cannot decrement value of type 'missing_postdecrement'}} _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(not_returning_iter_reference, ""); // expected-error {{static assertion failed}} - // expected-note-re@*:* {{because type constraint 'same_as<int, __iter_reference<not_returning_iter_reference>{{ ?}}>' was not satisfied}} + // expected-note-re@*:* {{'same_as<int, __iter_reference<not_returning_iter_reference>{{ ?}}>'}} // clang-format on } diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 6168e24..2e31fe5 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2773,6 +2773,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2, return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3); } +/// Matches MaskedStore Intrinsic. +template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3> +inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty +m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2, + const Opnd3 &Op3) { + return m_Intrinsic<Intrinsic::masked_store>(Op0, Op1, Op2, Op3); +} + /// Matches MaskedGather Intrinsic. 
template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3> inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index 2454149..74a4d6c 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -56,6 +56,7 @@ class OptimizationRemarkEmitter; class PHINode; class TargetLibraryInfo; class Value; +class IntrinsicInst; /// A private "module" namespace for types and utilities used by GVN. These /// are implementation details and should not be used by clients. namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn { @@ -349,6 +350,7 @@ private: // Helper functions of redundant load elimination. bool processLoad(LoadInst *L); + bool processMaskedLoad(IntrinsicInst *I); bool processNonLocalLoad(LoadInst *L); bool processAssumeIntrinsic(AssumeInst *II); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 23b72da..6e316f1 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -280,6 +280,9 @@ std::vector<Block *> LinkGraph::splitBlockImpl(std::vector<Block *> Blocks, void LinkGraph::dump(raw_ostream &OS) { DenseMap<Block *, std::vector<Symbol *>> BlockSymbols; + OS << "LinkGraph \"" << getName() + << "\" (triple = " << getTargetTriple().str() << ")\n"; + // Map from blocks to the symbols pointing at them. for (auto *Sym : defined_symbols()) BlockSymbols[&Sym->getBlock()].push_back(Sym); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 584b9f0..17050b0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -21,23 +21,21 @@ JITLinkerBase::~JITLinkerBase() = default; void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) { - LLVM_DEBUG({ - dbgs() << "Starting link phase 1 for graph " << G->getName() << "\n"; - }); + LLVM_DEBUG(dbgs() << "Starting link phase 1\n"); // Prune and optimize the graph. if (auto Err = runPasses(Passes.PrePrunePasses)) return Ctx->notifyFailed(std::move(Err)); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() << "\" pre-pruning:\n"; + dbgs() << "Link graph pre-pruning:\n"; G->dump(dbgs()); }); prune(*G); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() << "\" post-pruning:\n"; + dbgs() << "Link graph post-pruning:\n"; G->dump(dbgs()); }); @@ -67,14 +65,15 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) { void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self, AllocResult AR) { + LLVM_DEBUG(dbgs() << "Starting link phase 2\n"); + if (AR) Alloc = std::move(*AR); else return Ctx->notifyFailed(AR.takeError()); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() - << "\" before post-allocation passes:\n"; + dbgs() << "Link graph before post-allocation passes:\n"; G->dump(dbgs()); }); @@ -131,9 +130,7 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self, void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, Expected<AsyncLookupResult> LR) { - LLVM_DEBUG({ - dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n"; - }); + LLVM_DEBUG(dbgs() << "Starting link phase 3\n"); // If the lookup failed, bail out. 
if (!LR) @@ -143,8 +140,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, applyLookupResult(*LR); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() - << "\" before pre-fixup passes:\n"; + dbgs() << "Link graph before pre-fixup passes:\n"; G->dump(dbgs()); }); @@ -152,7 +148,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, return abandonAllocAndBailOut(std::move(Self), std::move(Err)); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n"; + dbgs() << "Link graph before copy-and-fixup:\n"; G->dump(dbgs()); }); @@ -161,7 +157,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, return abandonAllocAndBailOut(std::move(Self), std::move(Err)); LLVM_DEBUG({ - dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n"; + dbgs() << "Link graph after copy-and-fixup:\n"; G->dump(dbgs()); }); @@ -186,16 +182,14 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, void JITLinkerBase::linkPhase4(std::unique_ptr<JITLinkerBase> Self, FinalizeResult FR) { - LLVM_DEBUG({ - dbgs() << "Starting link phase 4 for graph " << G->getName() << "\n"; - }); + LLVM_DEBUG(dbgs() << "Starting link phase 4\n"); if (!FR) return Ctx->notifyFailed(FR.takeError()); Ctx->notifyFinalized(std::move(*FR)); - LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; }); + LLVM_DEBUG({ dbgs() << "Link complete\n"; }); } Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) { diff --git a/llvm/lib/Object/BuildID.cpp b/llvm/lib/Object/BuildID.cpp index 89d6bc3..d1ee597 100644 --- a/llvm/lib/Object/BuildID.cpp +++ b/llvm/lib/Object/BuildID.cpp @@ -24,6 +24,24 @@ using namespace llvm::object; namespace { template <typename ELFT> BuildIDRef getBuildID(const ELFFile<ELFT> &Obj) { + auto findBuildID = [&Obj](const auto &ShdrOrPhdr, + uint64_t Alignment) -> std::optional<BuildIDRef> { + Error Err = Error::success(); + for (auto N : Obj.notes(ShdrOrPhdr, Err)) + if (N.getType() == ELF::NT_GNU_BUILD_ID && + N.getName() == ELF::ELF_NOTE_GNU) + return N.getDesc(Alignment); + consumeError(std::move(Err)); + return std::nullopt; + }; + + auto Sections = cantFail(Obj.sections()); + for (const auto &S : Sections) { + if (S.sh_type != ELF::SHT_NOTE) + continue; + if (std::optional<BuildIDRef> ShdrRes = findBuildID(S, S.sh_addralign)) + return ShdrRes.value(); + } auto PhdrsOrErr = Obj.program_headers(); if (!PhdrsOrErr) { consumeError(PhdrsOrErr.takeError()); @@ -32,12 +50,8 @@ template <typename ELFT> BuildIDRef getBuildID(const ELFFile<ELFT> &Obj) { for (const auto &P : *PhdrsOrErr) { if (P.p_type != ELF::PT_NOTE) continue; - Error Err = Error::success(); - for (auto N : Obj.notes(P, Err)) - if (N.getType() == ELF::NT_GNU_BUILD_ID && - N.getName() == ELF::ELF_NOTE_GNU) - return N.getDesc(P.p_align); - consumeError(std::move(Err)); + if (std::optional<BuildIDRef> PhdrRes = findBuildID(P, P.p_align)) + return PhdrRes.value(); } return {}; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 8d6eb91..4357264d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -282,7 +282,7 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", static cl::opt<bool> SplitSVEObjects("aarch64-split-sve-objects", cl::desc("Split allocation of ZPR & PPR objects"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); cl::opt<bool> 
EnableHomogeneousPrologEpilog( "homogeneous-prolog-epilog", cl::Hidden, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8c4b4f6..50a8754 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5632,75 +5632,94 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, TTI::TargetCostKind CostKind) const { InstructionCost Invalid = InstructionCost::getInvalid(); - InstructionCost Cost(TTI::TCC_Basic); if (CostKind != TTI::TCK_RecipThroughput) return Invalid; - // Sub opcodes currently only occur in chained cases. - // Independent partial reduction subtractions are still costed as an add + if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() && + (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || OpAExtend == TTI::PR_None) return Invalid; + assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && + (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) && + "Unexpected values for OpBExtend or InputTypeB"); + // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. - // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but - // only if the i8mm or sve/streaming features are available. - if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB || - OpBExtend == TTI::PR_None || - (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && - !ST->isSVEorStreamingSVEAvailable()))) + if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) return Invalid; - assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && - "Unexpected values for OpBExtend or InputTypeB"); - EVT InputEVT = EVT::getEVT(InputTypeA); - EVT AccumEVT = EVT::getEVT(AccumType); + bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; + if (IsUSDot && !ST->hasMatMulInt8()) + return Invalid; + + unsigned Ratio = + AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits(); + if (VF.getKnownMinValue() <= Ratio) + return Invalid; + + VectorType *InputVectorType = VectorType::get(InputTypeA, VF); + VectorType *AccumVectorType = + VectorType::get(AccumType, VF.divideCoefficientBy(Ratio)); + // We don't yet support all kinds of legalization. + auto TA = TLI->getTypeAction(AccumVectorType->getContext(), + EVT::getEVT(AccumVectorType)); + switch (TA) { + default: + return Invalid; + case TargetLowering::TypeLegal: + case TargetLowering::TypePromoteInteger: + case TargetLowering::TypeSplitVector: + break; + } + + // Check what kind of type-legalisation happens. + std::pair<InstructionCost, MVT> AccumLT = + getTypeLegalizationCost(AccumVectorType); + std::pair<InstructionCost, MVT> InputLT = + getTypeLegalizationCost(InputVectorType); - unsigned VFMinValue = VF.getKnownMinValue(); + InstructionCost Cost = InputLT.first * TTI::TCC_Basic; - if (VF.isScalable()) { - if (!ST->isSVEorStreamingSVEAvailable()) - return Invalid; + // Prefer using full types by costing half-full input types as more expensive. 
+ if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(), + TypeSize::getScalable(128))) + // FIXME: This can be removed after the cost of the extends are folded into + // the dot-product expression in VPlan, after landing: + // https://github.com/llvm/llvm-project/pull/147302 + Cost *= 2; - // Don't accept a partial reduction if the scaled accumulator is vscale x 1, - // since we can't lower that type. - unsigned Scale = - AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits(); - if (VFMinValue == Scale) - return Invalid; + if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) { + // i16 -> i64 is natively supported for udot/sdot + if (AccumLT.second.getScalarType() == MVT::i64 && + InputLT.second.getScalarType() == MVT::i16) + return Cost; + // i8 -> i64 is supported with an extra level of extends + if (AccumLT.second.getScalarType() == MVT::i64 && + InputLT.second.getScalarType() == MVT::i8) + // FIXME: This cost should probably be a little higher, e.g. Cost + 2 + // because it requires two extra extends on the inputs. But if we'd change + // that now, a regular reduction would be cheaper because the costs of + // the extends in the IR are still counted. This can be fixed + // after https://github.com/llvm/llvm-project/pull/147302 has landed. + return Cost; } - if (VF.isFixed() && - (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64)) - return Invalid; - if (InputEVT == MVT::i8) { - switch (VFMinValue) { - default: - return Invalid; - case 8: - if (AccumEVT == MVT::i32) - Cost *= 2; - else if (AccumEVT != MVT::i64) - return Invalid; - break; - case 16: - if (AccumEVT == MVT::i64) - Cost *= 2; - else if (AccumEVT != MVT::i32) - return Invalid; - break; - } - } else if (InputEVT == MVT::i16) { - // FIXME: Allow i32 accumulator but increase cost, as we would extend - // it to i64. - if (VFMinValue != 8 || AccumEVT != MVT::i64) - return Invalid; - } else - return Invalid; + // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE. + if (ST->isSVEorStreamingSVEAvailable() || + (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() && + ST->hasDotProd())) { + if (AccumLT.second.getScalarType() == MVT::i32 && + InputLT.second.getScalarType() == MVT::i8) + return Cost; + } - return Cost; + // Add additional cost for the extends that would need to be inserted. + return Cost + 4; } InstructionCost diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 6efa78e..a4ef524 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -608,8 +608,6 @@ public: ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] : EmptySet; - const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size(); - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { // Each iteration of this loop assigns exactly one global variable to // exactly one of the implementation strategies. 
@@ -649,8 +647,7 @@ public: ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (K.second.size() == HybridModuleRootKernelsSize && - set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second == HybridModuleRootKernels) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 76bfce8..5e27b37 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1013,6 +1013,15 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } else if (T == X_CNT) { + WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; + if (PendingEvents & (1 << OtherEvent)) { + // Hardware inserts an implicit xcnt between interleaved + // SMEM and VMEM operations. So there will never be + // outstanding address translations for both SMEM and + // VMEM at the same time. + setScoreLB(T, CurrScore - 1); + PendingEvents &= ~(1 << OtherEvent); + } for (const MachineOperand &Op : Inst.all_uses()) setScoreByOperand(&Inst, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { @@ -2220,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. + // For architectures with X_CNT, mark the source address operands + // with the appropriate counter values. // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. bool IsVMEMAccess = false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 8f1dd62..5630580 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, let HasSGPR = 1; let Size = 64; } + +def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} + +def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128_Align2, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} } // End GeneratePressureSet = 0 // Define a register tuple class, along with one requiring an even diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index fa130a1..26ff54c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -775,6 +775,16 @@ class VectorType; bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. 
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getScalarSizeInBits() <= 32; + } + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override; diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index af1ceb6..cf6f83a 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -110,16 +110,16 @@ def : StPat<truncstorei8, SB, GPR, i16>; let Predicates = [HasAtomicLdSt] in { // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<atomic_load_aext_8, LBU, i16>; - def : LdPat<atomic_load_nonext_16, LH, i16>; + def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; + def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - def : StPat<atomic_store_8, SB, GPR, i16>; - def : StPat<atomic_store_16, SH, GPR, i16>; + def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; + def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; } let Predicates = [HasAtomicLdSt, IsRV64] in { // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<atomic_store_32, SW, GPR, i32>; + def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 143c4c4..e7709ef 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -149,6 +149,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, }); } + getActionDefinitionsBuilder({G_UMIN, G_UMAX, G_SMIN, G_SMAX}) + .widenScalarToNextPow2(0, /*Min=*/32) + .lower(); + // integer addition/subtraction getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({s8, s16, s32}) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cda5568..3802506 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45457,7 +45457,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget) { EVT SrcVT = Src.getValueType(); - if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) + if (Subtarget.useSoftFloat() || !SrcVT.isSimple() || + SrcVT.getScalarType() != MVT::i1) return SDValue(); // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 26e17cc..b9b5b58 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2287,6 +2287,35 @@ bool GVNPass::processLoad(LoadInst *L) { return true; } +// Attempt to process masked loads which have loaded from +// masked stores with the same mask +bool GVNPass::processMaskedLoad(IntrinsicInst *I) { + if (!MD) + return false; + MemDepResult Dep = MD->getDependency(I); + Instruction *DepInst = Dep.getInst(); + if (!DepInst || !Dep.isLocal() || !Dep.isDef()) + return false; + + Value *Mask = I->getOperand(2); + Value *Passthrough = I->getOperand(3); + Value *StoreVal; + if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(), + m_Specific(Mask))) || + StoreVal->getType() != I->getType()) + return false; + + // Remove the load but generate a select for the passthrough + Value *OpToForward = llvm::SelectInst::Create(Mask, 
StoreVal, Passthrough, "", + I->getIterator()); + + ICF->removeUsersOf(I); + I->replaceAllUsesWith(OpToForward); + salvageAndRemoveInstruction(I); + ++NumGVNLoad; + return true; +} + /// Return a pair the first field showing the value number of \p Exp and the /// second field showing whether it is a value number newly created. std::pair<uint32_t, bool> @@ -2734,6 +2763,10 @@ bool GVNPass::processInstruction(Instruction *I) { return false; } + if (match(I, m_Intrinsic<Intrinsic::masked_load>()) && + processMaskedLoad(cast<IntrinsicInst>(I))) + return true; + // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7750687..cb6bfb2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8694,7 +8694,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { Plan->addVF(VF); if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - Plan, + *Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f76777b..ca63bf3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -45,13 +45,13 @@ static cl::opt<bool> EnableWideActiveLaneMask( cl::desc("Enable use of wide get active lane mask instructions")); bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - VPlanPtr &Plan, + VPlan &Plan, function_ref<const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI) { ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan->getVectorLoopRegion()); + Plan.getVectorLoopRegion()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { // Skip blocks outside region if (!VPBB->getParent()) @@ -77,11 +77,11 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( for (VPValue *Op : PhiR->operands()) NewRecipe->addOperand(Op); } else { - VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue()); + VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue()); VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep()); + vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); NewRecipe = new VPWidenIntOrFpInductionRecipe( - Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc()); + Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc()); } } else { assert(isa<VPInstruction>(&Ingredient) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 4c65cb7..2f00e51 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -138,7 +138,7 @@ struct VPlanTransforms { /// widen recipes. Returns false if any VPInstructions could not be converted /// to a wide recipe if needed. 
LLVM_ABI_FOR_TEST static bool tryToConvertVPInstructionsToVPRecipes( - VPlanPtr &Plan, + VPlan &Plan, function_ref<const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI); diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 6f98eae..8399292 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -507,14 +507,10 @@ if(build_runtimes) endif() # Forward user-provived system configuration to runtimes for requirement introspection. - # CMAKE_PREFIX_PATH is the search path for CMake packages. In order to pass through - # the command line interface, the CMake semicolon separator needs to be replaced - # with $<SEMICOLON> + # CMAKE_PREFIX_PATH is the search path for CMake packages. if(CMAKE_PREFIX_PATH) - string(JOIN "$<SEMICOLON>" escaped_cmake_prefix_path ${CMAKE_PREFIX_PATH}) - list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${escaped_cmake_prefix_path}") + list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}") endif() - # CMAKE_PROGRAM_PATH is the search path for executables such as python. if(CMAKE_PROGRAM_PATH) list(APPEND extra_cmake_args "-DCMAKE_PROGRAM_PATH=${CMAKE_PROGRAM_PATH}") diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll index 1de8d0a..01e3d3a 100644 --- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll +++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll @@ -68,13 +68,12 @@ entry: } ; SVE calling conventions -; Predicate register spills end up in FP region, currently. This can be -; mitigated with the -aarch64-enable-zpr-predicate-spills option. +; Padding is placed between predicate and fpr/zpr register spills, so only emit remarks when hazard padding is off. +; Note: The -aarch64-enable-zpr-predicate-spills option is deprecated (and will be removed soon). 
define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 { ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale] ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64] -; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale] ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call': ; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object ; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64] @@ -89,7 +88,6 @@ entry: define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 { ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale] ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64] -; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale] ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': ; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object ; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64] diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index 333a8be..bdee359 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects=false -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE define i32 @basic(i32 noundef %num) { ; CHECK-LABEL: basic: @@ -1940,23 +1940,22 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; ; CHECK64-LABEL: svecc_call: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -1969,30 +1968,32 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 
0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: mov x8, x0 ; CHECK64-NEXT: bl __arm_sme_state @@ -2014,22 +2015,32 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; CHECK64-NEXT: mov w0, #22647 // =0x5877 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: add sp, sp, #64 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, 
mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #16 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -2042,20 +2053,11 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: addvl sp, sp, #18 -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: addvl sp, sp, #2 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 
; CHECK64-NEXT: .cfi_restore w26 @@ -2463,23 +2465,22 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; ; CHECK64-LABEL: svecc_alloca_call: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -2492,30 +2493,32 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 
0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: bl __arm_sme_state ; CHECK64-NEXT: mov x19, x0 @@ -2536,22 +2539,32 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; CHECK64-NEXT: mov w0, #22647 // =0x5877 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: add sp, sp, #112 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 
16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #16 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -2564,20 +2577,11 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: addvl sp, sp, #18 -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; 
CHECK64-NEXT: addvl sp, sp, #2 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w26 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 243f0ed..f8655a7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: .LBB5_3: ; %bb4 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 029aa39..ce1ea4d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
@@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
@@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
@@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 92836d8..63db24a 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... 
@@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ 
-1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 9cbdc38..5b3e486 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: 
S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8847370 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9568266 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 6509d80..f88b1bf 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit 
$exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 @@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index d7b713a..0b4e662 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef 
%5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir 
b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index 57f611b..4c2ea2f 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index af8b9e7..6fe99d8 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -520,6 +520,7 @@ body: | ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) $vgpr0 = V_MOV_B32_e32 1, implicit $exec @@ -921,7 +922,6 @@ body: | $vgpr2 = V_MOV_B32_e32 1, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. 
--- name: wait_kmcnt_with_outstanding_vmem tracksRegLiveness: true @@ -937,6 +937,7 @@ body: | ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec @@ -944,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0. --- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true @@ -960,6 +960,7 @@ body: | ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: S_WAIT_LOADCNT 0 ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 @@ -967,7 +968,6 @@ body: | $sgpr0 = S_MOV_B32 0 ... -# TODO: Unnecessary wait before overwriting vgpr0. --- name: overwrite_vgpr_after_smem tracksRegLiveness: true @@ -981,14 +981,12 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# TODO: Unnecessary wait before overwriting sgpr0. --- name: overwrite_sgpr_after_vmem tracksRegLiveness: true @@ -1002,7 +1000,6 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/ARM/and-mask-variable.ll b/llvm/test/CodeGen/ARM/and-mask-variable.ll new file mode 100644 index 0000000..0f84b76 --- /dev/null +++ b/llvm/test/CodeGen/ARM/and-mask-variable.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M +; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A +; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T +; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M + +define i32 @mask_pair(i32 %x, i32 %y) { +; V7M-LABEL: mask_pair: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: mask_pair: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: mask_pair: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: mask_pair: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: bx lr + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; V7M-LABEL: mask_pair_64: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r12, r3, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl.w r12, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl r3, r2 +; V7M-NEXT: and.w r0, r0, r12 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: 
mask_pair_64: +; V7A: @ %bb.0: +; V7A-NEXT: subs r12, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lslpl r3, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: and r0, r2, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: mask_pair_64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsl.w r12, r3, r2 +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w r12, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r3, r2 +; V7A-T-NEXT: and.w r0, r0, r12 +; V7A-T-NEXT: ands r1, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: mask_pair_64: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll new file mode 100644 index 0000000..77deaa5 --- /dev/null +++ b/llvm/test/CodeGen/ARM/extract-bits.ll @@ -0,0 +1,4591 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M +; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A +; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T +; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M + +; Patterns: +; a) (x >> start) & (1 << nbits) - 1 +; b) (x >> start) & ~(-1 << nbits) +; c) (x >> start) & (-1 >> (32 - y)) +; d) (x >> start) << (32 - y) >> (32 - y) +; are equivalent. + +; ---------------------------------------------------------------------------- ; +; Pattern a. 
32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_a0: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: movs r1, #1 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a0: +; V7A: @ %bb.0: +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, lsr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a0: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: subs r1, r1, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_a0_arithmetic: +; V7M: @ %bb.0: +; V7M-NEXT: asrs r0, r1 +; V7M-NEXT: movs r1, #1 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a0_arithmetic: +; V7A: @ %bb.0: +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, asr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a0_arithmetic: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: asrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a0_arithmetic: +; V6M: @ %bb.0: +; V6M-NEXT: asrs r0, r1 +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: subs r1, r1, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %shifted = ashr i32 %val, %numskipbits + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr32_a1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: movs r1, #1 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, lsr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: subs r1, r1, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %conv = zext i8 %numlowbits to i32 + %onebit = shl i32 1, %conv + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_a2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: movs r1, #1 +; 
V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, lsr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a2_load: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: lsrs r3, r1 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: ands r0, r3 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %shifted = lshr i32 %val, %numskipbits + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr32_a3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: movs r1, #1 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, lsr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: lsrs r3, r1 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: ands r0, r3 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %conv = zext i8 %numlowbits to i32 + %onebit = shl i32 1, %conv + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_a4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: movs r1, #1 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_a4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r2, r3, r12, lsl r2 +; V7A-NEXT: and r0, r2, r0, lsr r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_a4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsls r1, r2 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_a4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: subs r1, r1, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %shifted, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_a0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push 
{r4, lr} +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: mov.w lr, #1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: rsb.w r4, r12, #32 +; V7M-NEXT: subs.w r3, r12, #32 +; V7M-NEXT: lsr.w r4, lr, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r4, lr, r3 +; V7M-NEXT: lsl.w r3, lr, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: subs r3, #1 +; V7M-NEXT: sbc r12, r4, #0 +; V7M-NEXT: rsb.w r4, r2, #32 +; V7M-NEXT: lsl.w r4, r1, r4 +; V7M-NEXT: orrs r0, r4 +; V7M-NEXT: subs.w r4, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r4 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: and.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: and.w r1, r1, r12 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_a0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r3, r12, #32 +; V7A-NEXT: subs r4, r12, #32 +; V7A-NEXT: lsr r3, lr, r3 +; V7A-NEXT: lslpl r3, lr, r4 +; V7A-NEXT: lsl r4, lr, r12 +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: subs r4, r4, #1 +; V7A-NEXT: sbc r12, r3, #0 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: and r0, r4, r0 +; V7A-NEXT: and r1, r12, r1 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_a0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: mov.w lr, #1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: rsb.w r4, r12, #32 +; V7A-T-NEXT: subs.w r3, r12, #32 +; V7A-T-NEXT: lsr.w r4, lr, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, lr, r3 +; V7A-T-NEXT: lsl.w r3, lr, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: subs r3, #1 +; V7A-T-NEXT: sbc r12, r4, #0 +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: lsl.w r4, r1, r4 +; V7A-T-NEXT: orrs r0, r4 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: and.w r1, r1, r12 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #12 +; V6M-NEXT: sub sp, #12 +; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill +; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: ldr r2, [sp, #32] +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: subs r5, r0, #1 +; V6M-NEXT: sbcs r4, r7 +; V6M-NEXT: mov r0, r6 +; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: add sp, #12 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_a0_arithmetic: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: mov.w lr, #1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: rsb.w r4, r12, #32 +; V7M-NEXT: subs.w r3, r12, #32 +; V7M-NEXT: lsr.w r4, lr, r4 +; V7M-NEXT: it pl 
+; V7M-NEXT: lslpl.w r4, lr, r3 +; V7M-NEXT: lsl.w r3, lr, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: subs r3, #1 +; V7M-NEXT: sbc r12, r4, #0 +; V7M-NEXT: rsb.w r4, r2, #32 +; V7M-NEXT: lsl.w r4, r1, r4 +; V7M-NEXT: orrs r0, r4 +; V7M-NEXT: subs.w r4, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: asrpl.w r0, r1, r4 +; V7M-NEXT: asr.w r2, r1, r2 +; V7M-NEXT: and.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: asrpl r2, r1, #31 +; V7M-NEXT: and.w r1, r12, r2 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_a0_arithmetic: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r3, r12, #32 +; V7A-NEXT: subs r4, r12, #32 +; V7A-NEXT: lsr r3, lr, r3 +; V7A-NEXT: lslpl r3, lr, r4 +; V7A-NEXT: lsl r4, lr, r12 +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: subs r4, r4, #1 +; V7A-NEXT: sbc r12, r3, #0 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: asr r2, r1, r2 +; V7A-NEXT: asrpl r0, r1, r3 +; V7A-NEXT: asrpl r2, r1, #31 +; V7A-NEXT: and r0, r4, r0 +; V7A-NEXT: and r1, r12, r2 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_a0_arithmetic: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: mov.w lr, #1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: rsb.w r4, r12, #32 +; V7A-T-NEXT: subs.w r3, r12, #32 +; V7A-T-NEXT: lsr.w r4, lr, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, lr, r3 +; V7A-T-NEXT: lsl.w r3, lr, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: subs r3, #1 +; V7A-T-NEXT: sbc r12, r4, #0 +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: lsl.w r4, r1, r4 +; V7A-T-NEXT: orrs r0, r4 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: asrpl.w r0, r1, r4 +; V7A-T-NEXT: asr.w r2, r1, r2 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: asrpl r2, r1, #31 +; V7A-T-NEXT: and.w r1, r12, r2 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a0_arithmetic: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #12 +; V6M-NEXT: sub sp, #12 +; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill +; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: ldr r2, [sp, #32] +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: subs r5, r0, #1 +; V6M-NEXT: sbcs r4, r7 +; V6M-NEXT: mov r0, r6 +; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_lasr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: add sp, #12 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %shifted = ashr i64 %val, %numskipbits + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr64_a1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: rsb.w r4, r3, #32 +; V7M-NEXT: mov.w lr, #1 +; V7M-NEXT: subs.w r12, r3, #32 +; V7M-NEXT: lsl.w r3, lr, r3 +; V7M-NEXT: lsr.w r4, lr, r4 +; V7M-NEXT: lsr.w r0, r0, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r4, lr, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: subs r3, #1 +; V7M-NEXT: sbc r12, r4, #0 +; V7M-NEXT: 
rsb.w r4, r2, #32 +; V7M-NEXT: lsl.w r4, r1, r4 +; V7M-NEXT: orrs r0, r4 +; V7M-NEXT: subs.w r4, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r4 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: and.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: and.w r1, r1, r12 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_a1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: rsb r12, r3, #32 +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: subs r4, r3, #32 +; V7A-NEXT: lsl r3, lr, r3 +; V7A-NEXT: lsr r12, lr, r12 +; V7A-NEXT: movwpl r3, #0 +; V7A-NEXT: lslpl r12, lr, r4 +; V7A-NEXT: rsb r4, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: subs r3, r3, #1 +; V7A-NEXT: sbc r12, r12, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r4 +; V7A-NEXT: subs r4, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r4 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: and r1, r12, r1 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_a1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r4, r3, #32 +; V7A-T-NEXT: mov.w lr, #1 +; V7A-T-NEXT: subs.w r12, r3, #32 +; V7A-T-NEXT: lsl.w r3, lr, r3 +; V7A-T-NEXT: lsr.w r4, lr, r4 +; V7A-T-NEXT: lsr.w r0, r0, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, lr, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: subs r3, #1 +; V7A-T-NEXT: sbc r12, r4, #0 +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: lsl.w r4, r1, r4 +; V7A-T-NEXT: orrs r0, r4 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: and.w r1, r1, r12 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #12 +; V6M-NEXT: sub sp, #12 +; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill +; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: mov r2, r3 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: subs r5, r0, #1 +; V6M-NEXT: sbcs r4, r7 +; V6M-NEXT: mov r0, r6 +; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: add sp, #12 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %conv = zext i8 %numlowbits to i64 + %onebit = shl i64 1, %conv + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_a2_load: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: mov.w lr, #1 +; V7M-NEXT: rsb.w r1, r12, #32 +; V7M-NEXT: subs.w r3, r12, #32 +; V7M-NEXT: lsr.w r1, lr, r1 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, lr, r3 +; V7M-NEXT: lsl.w r3, lr, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: subs.w lr, r3, #1 +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: sbc r12, r1, #0 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsl.w r1, r3, r1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: orrs r0, r1 +; V7M-NEXT: subs.w r1, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r3, r1 +; 
V7M-NEXT: lsr.w r1, r3, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: and.w r0, r0, lr +; V7M-NEXT: and.w r1, r1, r12 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_a2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r6, lr} +; V7A-NEXT: push {r4, r5, r6, lr} +; V7A-NEXT: ldr r1, [sp, #16] +; V7A-NEXT: mov r3, #1 +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: ldr r5, [r0, #4] +; V7A-NEXT: rsb r0, r1, #32 +; V7A-NEXT: subs r4, r1, #32 +; V7A-NEXT: lsl r1, r3, r1 +; V7A-NEXT: lsr r0, r3, r0 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: lslpl r0, r3, r4 +; V7A-NEXT: subs r1, r1, #1 +; V7A-NEXT: sbc r3, r0, #0 +; V7A-NEXT: lsr r0, r6, r2 +; V7A-NEXT: rsb r6, r2, #32 +; V7A-NEXT: orr r0, r0, r5, lsl r6 +; V7A-NEXT: subs r6, r2, #32 +; V7A-NEXT: lsrpl r0, r5, r6 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: lsr r1, r5, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: pop {r4, r5, r6, pc} +; +; V7A-T-LABEL: bextr64_a2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: movs r3, #1 +; V7A-T-NEXT: ldrd lr, r1, [r0] +; V7A-T-NEXT: rsb.w r4, r12, #32 +; V7A-T-NEXT: subs.w r0, r12, #32 +; V7A-T-NEXT: lsr.w r4, r3, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, r3, r0 +; V7A-T-NEXT: lsl.w r0, r3, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsr.w r3, lr, r2 +; V7A-T-NEXT: subs r0, #1 +; V7A-T-NEXT: sbc r12, r4, #0 +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: lsl.w r4, r1, r4 +; V7A-T-NEXT: orrs r3, r4 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r3, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: and.w r1, r1, r12 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #4 +; V6M-NEXT: sub sp, #4 +; V6M-NEXT: str r2, [sp] @ 4-byte Spill +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: ldr r2, [sp, #24] +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r6, r1 +; V6M-NEXT: subs r4, r0, #1 +; V6M-NEXT: sbcs r6, r7 +; V6M-NEXT: ldm r5!, {r0, r1} +; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: ands r1, r6 +; V6M-NEXT: add sp, #4 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %val = load i64, ptr %w + %shifted = lshr i64 %val, %numskipbits + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr64_a3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: mov.w r12, #1 +; V7M-NEXT: subs.w lr, r2, #32 +; V7M-NEXT: lsl.w r2, r12, r2 +; V7M-NEXT: lsr.w r3, r12, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r12, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs.w lr, r2, #1 +; V7M-NEXT: ldrd r0, r2, [r0] +; V7M-NEXT: sbc r12, r3, #0 +; V7M-NEXT: rsb.w r3, r1, #32 +; V7M-NEXT: lsl.w r3, r2, r3 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r1, #32 +; V7M-NEXT: lsr.w r1, r2, r1 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r2, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: and.w r0, r0, lr +; V7M-NEXT: and.w r1, 
r1, r12 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_a3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r6, lr} +; V7A-NEXT: push {r4, r5, r6, lr} +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: mov r3, #1 +; V7A-NEXT: ldr r5, [r0, #4] +; V7A-NEXT: rsb r0, r2, #32 +; V7A-NEXT: subs r4, r2, #32 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lsr r0, r3, r0 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: lslpl r0, r3, r4 +; V7A-NEXT: subs r3, r2, #1 +; V7A-NEXT: sbc r0, r0, #0 +; V7A-NEXT: lsr r2, r5, r1 +; V7A-NEXT: subs r4, r1, #32 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: and r2, r0, r2 +; V7A-NEXT: lsr r0, r6, r1 +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: orr r0, r0, r5, lsl r1 +; V7A-NEXT: mov r1, r2 +; V7A-NEXT: lsrpl r0, r5, r4 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: pop {r4, r5, r6, pc} +; +; V7A-T-LABEL: bextr64_a3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: mov.w lr, #1 +; V7A-T-NEXT: subs.w r3, r2, #32 +; V7A-T-NEXT: lsl.w r2, lr, r2 +; V7A-T-NEXT: lsr.w r4, lr, r4 +; V7A-T-NEXT: ldrd r12, r0, [r0] +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, lr, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: subs.w lr, r2, #1 +; V7A-T-NEXT: sbc r2, r4, #0 +; V7A-T-NEXT: lsr.w r4, r0, r1 +; V7A-T-NEXT: subs.w r3, r1, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r4, #0 +; V7A-T-NEXT: and.w r2, r2, r4 +; V7A-T-NEXT: rsb.w r4, r1, #32 +; V7A-T-NEXT: lsr.w r1, r12, r1 +; V7A-T-NEXT: lsl.w r4, r0, r4 +; V7A-T-NEXT: orr.w r1, r1, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r1, r0, r3 +; V7A-T-NEXT: and.w r0, lr, r1 +; V7A-T-NEXT: mov r1, r2 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #4 +; V6M-NEXT: sub sp, #4 +; V6M-NEXT: str r1, [sp] @ 4-byte Spill +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: subs r4, r0, #1 +; V6M-NEXT: sbcs r5, r7 +; V6M-NEXT: ldm r6!, {r0, r1} +; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: ands r1, r5 +; V6M-NEXT: add sp, #4 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %val = load i64, ptr %w + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %conv = zext i8 %numlowbits to i64 + %onebit = shl i64 1, %conv + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_a4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: mov.w lr, #1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: rsb.w r4, r12, #32 +; V7M-NEXT: subs.w r3, r12, #32 +; V7M-NEXT: lsr.w r4, lr, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r4, lr, r3 +; V7M-NEXT: lsl.w r3, lr, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: subs r3, #1 +; V7M-NEXT: sbc r12, r4, #0 +; V7M-NEXT: rsb.w r4, r2, #32 +; V7M-NEXT: lsl.w r4, r1, r4 +; V7M-NEXT: orrs r0, r4 +; V7M-NEXT: subs.w r4, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r4 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: and.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: and.w r1, r1, r12 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_a4_commutative: +; V7A: @ %bb.0: +; 
V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r3, r12, #32 +; V7A-NEXT: subs r4, r12, #32 +; V7A-NEXT: lsr r3, lr, r3 +; V7A-NEXT: lslpl r3, lr, r4 +; V7A-NEXT: lsl r4, lr, r12 +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: subs r4, r4, #1 +; V7A-NEXT: sbc r12, r3, #0 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: and r0, r0, r4 +; V7A-NEXT: and r1, r1, r12 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_a4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: mov.w lr, #1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: rsb.w r4, r12, #32 +; V7A-T-NEXT: subs.w r3, r12, #32 +; V7A-T-NEXT: lsr.w r4, lr, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r4, lr, r3 +; V7A-T-NEXT: lsl.w r3, lr, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: subs r3, #1 +; V7A-T-NEXT: sbc r12, r4, #0 +; V7A-T-NEXT: rsb.w r4, r2, #32 +; V7A-T-NEXT: lsl.w r4, r1, r4 +; V7A-T-NEXT: orrs r0, r4 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: and.w r1, r1, r12 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_a4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, r7, lr} +; V6M-NEXT: push {r4, r5, r6, r7, lr} +; V6M-NEXT: .pad #12 +; V6M-NEXT: sub sp, #12 +; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill +; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r7, #0 +; V6M-NEXT: ldr r2, [sp, #32] +; V6M-NEXT: mov r1, r7 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: subs r5, r0, #1 +; V6M-NEXT: sbcs r4, r7 +; V6M-NEXT: mov r0, r6 +; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: add sp, #12 +; V6M-NEXT: pop {r4, r5, r6, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %shifted, %mask ; swapped order + ret i64 %masked +} + +; 64-bit, but with 32-bit output + +; Everything done in 64-bit, truncation happens last. 
+define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_a0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsls r2, r1 +; V7M-NEXT: subs r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r1, r2, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_a0: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: mov r1, #1 +; V7A-NEXT: lsl r1, r1, r12 +; V7A-NEXT: subs r2, r12, #32 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: sub r1, r1, #1 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_a0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsl.w r1, r1, r12 +; V7A-T-NEXT: subs.w r2, r12, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_a0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: ldr r2, [sp, #8] +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, pc} + %shifted = lshr i64 %val, %numskipbits + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %shifted + %res = trunc i64 %masked to i32 + ret i32 %res +} + +; Shifting happens in 64-bit, then truncation. Masking is 32-bit. 
+define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_a1: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_a1: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: add r12, r3, lr, lsl r12 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: and r0, r12, r0 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_32_a1: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsl.w r1, r1, r12 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_a1: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r1, [sp, #8] +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: pop {r7, pc} + %shifted = lshr i64 %val, %numskipbits + %truncshifted = trunc i64 %shifted to i32 + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %truncshifted + ret i32 %masked +} + +; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit. +; Masking is 64-bit. Then truncation. 
+define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_a2: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_a2: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: add r12, r3, lr, lsl r12 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: and r0, r12, r0 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_32_a2: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: lsl.w r1, r1, r12 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_a2: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r1, [sp, #8] +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: pop {r7, pc} + %shifted = lshr i64 %val, %numskipbits + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %zextmask = zext i32 %mask to i64 + %masked = and i64 %zextmask, %shifted + %truncmasked = trunc i64 %masked to i32 + ret i32 %truncmasked +} + +; ---------------------------------------------------------------------------- ; +; Pattern b. 
32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_b0: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_b0: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_b0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_b0: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: mvns r1, r1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr32_b1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_b1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_b1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_b1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: mvns r1, r1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: bx lr + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %conv = zext i8 %numlowbits to i32 + %notmask = shl i32 -1, %conv + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_b2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_b2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_b2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_b2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #0 +; V6M-NEXT: mvns r3, r3 +; V6M-NEXT: lsls r3, r2 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bics r0, r3 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %shifted = lshr i32 %val, %numskipbits + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr32_b3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: 
bextr32_b3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_b3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_b3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #0 +; V6M-NEXT: mvns r3, r3 +; V6M-NEXT: lsls r3, r2 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bics r0, r3 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %conv = zext i8 %numlowbits to i32 + %notmask = shl i32 -1, %conv + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_b4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_b4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_b4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_b4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: mvns r1, r1 +; V6M-NEXT: lsls r1, r2 +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %shifted, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_b0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsl.w r3, r2, r12 +; V7M-NEXT: subs.w lr, r12, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r2, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: bics r1, r2 +; V7M-NEXT: bics r0, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_b0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: subs lr, r12, #32 +; V7A-NEXT: lsl r2, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: lslpl r3, r3, lr +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_b0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, r5, r7, lr} +; V7A-T-NEXT: push {r4, r5, r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #16] +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r5, r0, r3 +; V7A-T-NEXT: mov.w r3, #-1 +; 
V7A-T-NEXT: subs.w lr, r12, #32 +; V7A-T-NEXT: lsl.w r0, r3, r12 +; V7A-T-NEXT: itt pl +; V7A-T-NEXT: lslpl.w r3, r3, lr +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r5, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: bic.w r0, r5, r0 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: pop {r4, r5, r7, pc} +; +; V6M-LABEL: bextr64_b0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: ldr r2, [sp, #16] +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r4, r0 +; V6M-NEXT: bics r5, r1 +; V6M-NEXT: mov r0, r4 +; V6M-NEXT: mov r1, r5 +; V6M-NEXT: pop {r4, r5, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr64_b1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: lsr.w r12, r0, r2 +; V7M-NEXT: rsb.w r0, r2, #32 +; V7M-NEXT: lsl.w r0, r1, r0 +; V7M-NEXT: orr.w r12, r12, r0 +; V7M-NEXT: subs.w r0, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r12, r1, r0 +; V7M-NEXT: lsr.w r0, r1, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: subs.w r1, r3, #32 +; V7M-NEXT: lsl.w r3, r2, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl r2, r1 +; V7M-NEXT: bic.w r1, r0, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: bic.w r0, r12, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_b1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r12, r0, r2 +; V7A-NEXT: rsb r0, r2, #32 +; V7A-NEXT: orr r12, r12, r1, lsl r0 +; V7A-NEXT: subs r0, r2, #32 +; V7A-NEXT: lsrpl r12, r1, r0 +; V7A-NEXT: lsr r0, r1, r2 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: subs r1, r3, #32 +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: lsl r3, r2, r3 +; V7A-NEXT: lslpl r2, r2, r1 +; V7A-NEXT: bic r1, r0, r2 +; V7A-NEXT: movwpl r3, #0 +; V7A-NEXT: bic r0, r12, r3 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_b1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsr.w r12, r0, r2 +; V7A-T-NEXT: rsb.w r0, r2, #32 +; V7A-T-NEXT: lsl.w r0, r1, r0 +; V7A-T-NEXT: orr.w r12, r12, r0 +; V7A-T-NEXT: subs.w r0, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r12, r1, r0 +; V7A-T-NEXT: lsr.w r0, r1, r2 +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs.w r1, r3, #32 +; V7A-T-NEXT: lsl.w r3, r2, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r2, r1 +; V7A-T-NEXT: bic.w r1, r0, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: bic.w r0, r12, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_b1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r4, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: mov r6, r1 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r5, r0 +; V6M-NEXT: bics r6, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: pop {r4, r5, r6, pc} + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %conv = zext i8 %numlowbits to i64 + %notmask = shl i64 -1, %conv + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, 
%shifted + ret i64 %masked +} + +define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_b2_load: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsl.w r1, r3, r1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: orrs r0, r1 +; V7M-NEXT: subs.w r1, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r3, r1 +; V7M-NEXT: lsr.w r1, r3, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsl.w r3, r2, r12 +; V7M-NEXT: subs.w lr, r12, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r2, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: bics r1, r2 +; V7M-NEXT: bics r0, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_b2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: ldrd r0, r1, [r0] +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: subs lr, r12, #32 +; V7A-NEXT: lsl r2, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: lslpl r3, r3, lr +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_b2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: ldrd r0, r3, [r0] +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: lsl.w r1, r3, r1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: orrs r0, r1 +; V7A-T-NEXT: subs.w r1, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r3, r1 +; V7A-T-NEXT: lsr.w r1, r3, r2 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: lsl.w r2, r3, r12 +; V7A-T-NEXT: subs.w lr, r12, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r3, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: bics r0, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bextr64_b2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: ldr r2, [sp, #16] +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r4, r0 +; V6M-NEXT: bics r5, r1 +; V6M-NEXT: mov r0, r4 +; V6M-NEXT: mov r1, r5 +; V6M-NEXT: pop {r4, r5, r7, pc} + %val = load i64, ptr %w + %shifted = lshr i64 %val, %numskipbits + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bextr64_b3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: ldrd r12, r0, [r0] +; V7M-NEXT: rsb.w r3, r1, #32 +; V7M-NEXT: lsl.w lr, r0, r3 +; V7M-NEXT: lsr.w r3, r12, r1 +; V7M-NEXT: orr.w r12, r3, lr +; V7M-NEXT: subs.w r3, r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r12, r0, r3 +; V7M-NEXT: lsr.w r0, r0, r1 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: subs.w r1, r2, #32 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: it pl +; 
V7M-NEXT: lslpl r3, r1 +; V7M-NEXT: bic.w r1, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: bic.w r0, r12, r2 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_b3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldm r0, {r0, r3} +; V7A-NEXT: lsr r12, r0, r1 +; V7A-NEXT: rsb r0, r1, #32 +; V7A-NEXT: orr r12, r12, r3, lsl r0 +; V7A-NEXT: subs r0, r1, #32 +; V7A-NEXT: lsrpl r12, r3, r0 +; V7A-NEXT: lsr r0, r3, r1 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: subs r1, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lslpl r3, r3, r1 +; V7A-NEXT: bic r1, r0, r3 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r0, r12, r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_b3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: ldrd r12, r3, [r0] +; V7A-T-NEXT: rsb.w r0, r1, #32 +; V7A-T-NEXT: lsl.w lr, r3, r0 +; V7A-T-NEXT: lsr.w r0, r12, r1 +; V7A-T-NEXT: orr.w r12, r0, lr +; V7A-T-NEXT: subs.w r0, r1, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r12, r3, r0 +; V7A-T-NEXT: lsr.w r0, r3, r1 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs.w r1, r2, #32 +; V7A-T-NEXT: lsl.w r2, r3, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r3, r1 +; V7A-T-NEXT: bic.w r1, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: bic.w r0, r12, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bextr64_b3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r4, r2 +; V6M-NEXT: mov r2, r1 +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: mov r6, r1 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r5, r0 +; V6M-NEXT: bics r6, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: pop {r4, r5, r6, pc} + %val = load i64, ptr %w + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %conv = zext i8 %numlowbits to i64 + %notmask = shl i64 -1, %conv + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_b4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsl.w r3, r2, r12 +; V7M-NEXT: subs.w lr, r12, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r2, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: bics r1, r2 +; V7M-NEXT: bics r0, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_b4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: lsr r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: subs lr, r12, #32 +; V7A-NEXT: lsl r2, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: lslpl r3, 
r3, lr +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_b4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, r5, r7, lr} +; V7A-T-NEXT: push {r4, r5, r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #16] +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r5, r0, r3 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: subs.w lr, r12, #32 +; V7A-T-NEXT: lsl.w r0, r3, r12 +; V7A-T-NEXT: itt pl +; V7A-T-NEXT: lslpl.w r3, r3, lr +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs.w r4, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r5, r1, r4 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: bic.w r0, r5, r0 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: pop {r4, r5, r7, pc} +; +; V6M-LABEL: bextr64_b4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: ldr r2, [sp, #16] +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r4, r0 +; V6M-NEXT: bics r5, r1 +; V6M-NEXT: mov r0, r4 +; V6M-NEXT: mov r1, r5 +; V6M-NEXT: pop {r4, r5, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %shifted, %mask ; swapped order + ret i64 %masked +} + +; 64-bit, but with 32-bit output + +; Everything done in 64-bit, truncation happens last. +define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_b0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldrb.w r1, [sp] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsls r2, r1 +; V7M-NEXT: subs r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: bics r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_b0: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldrb r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: lsl r1, r1, r12 +; V7A-NEXT: subs r2, r12, #32 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: bic r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_b0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsr.w r12, r0, r2 +; V7A-T-NEXT: rsb.w r0, r2, #32 +; V7A-T-NEXT: ldrb.w r3, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r0, r1, r0 +; V7A-T-NEXT: orr.w r0, r0, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: mov.w r1, #-1 +; V7A-T-NEXT: lsls r1, r3 +; V7A-T-NEXT: subs.w r2, r3, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_b0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: add r1, sp, #8 +; V6M-NEXT: ldrb r2, [r1] +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r4, r0 +; V6M-NEXT: mov r0, r4 +; V6M-NEXT: pop {r4, pc} + %shiftedval = lshr i64 %val, %numskipbits + %widenumlowbits = zext i8 %numlowbits to i64 + %notmask = shl nsw i64 -1, %widenumlowbits + %mask = xor i64 %notmask, -1 + %wideres = and i64 %shiftedval, %mask + %res = trunc 
i64 %wideres to i32 + ret i32 %res +} + +; Shifting happens in 64-bit, then truncation. Masking is 32-bit. +define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_b1: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldrb.w r1, [sp] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_b1: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldrb r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r12 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_b1: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldrb.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: mov.w r1, #-1 +; V7A-T-NEXT: lsl.w r1, r1, r12 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_b1: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: add r1, sp, #8 +; V6M-NEXT: ldrb r1, [r1] +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: pop {r7, pc} + %shiftedval = lshr i64 %val, %numskipbits + %truncshiftedval = trunc i64 %shiftedval to i32 + %widenumlowbits = zext i8 %numlowbits to i32 + %notmask = shl nsw i32 -1, %widenumlowbits + %mask = xor i32 %notmask, -1 + %res = and i32 %truncshiftedval, %mask + ret i32 %res +} + +; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit. +; Masking is 64-bit. Then truncation. 
+define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_b2: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldrb.w r1, [sp] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_b2: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldrb r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: bic r0, r0, r1, lsl r12 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_b2: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldrb.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: mov.w r1, #-1 +; V7A-T-NEXT: lsl.w r1, r1, r12 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_b2: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: add r1, sp, #8 +; V6M-NEXT: ldrb r1, [r1] +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: pop {r7, pc} + %shiftedval = lshr i64 %val, %numskipbits + %widenumlowbits = zext i8 %numlowbits to i32 + %notmask = shl nsw i32 -1, %widenumlowbits + %mask = xor i32 %notmask, -1 + %zextmask = zext i32 %mask to i64 + %wideres = and i64 %shiftedval, %zextmask + %res = trunc i64 %wideres to i32 + ret i32 %res +} + +; ---------------------------------------------------------------------------- ; +; Pattern c. 
32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_c0: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_c0: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_c0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_c0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #32 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: lsrs r0, r2 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr32_c1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_c1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_c1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_c1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #32 +; V6M-NEXT: subs r1, r1, r2 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %mask = lshr i32 -1, %sh_prom + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_c2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_c2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_c2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_c2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #32 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: lsrs r0, r2 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %shifted = lshr i32 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 
@bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr32_c3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_c3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_c3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_c3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #32 +; V6M-NEXT: subs r1, r1, r2 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %mask = lshr i32 -1, %sh_prom + %masked = and i32 %mask, %shifted + ret i32 %masked +} + +define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_c4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_c4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_c4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_c4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #32 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: lsrs r0, r2 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %shifted, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_c0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: ldr.w r12, [sp] +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: rsb.w r3, r12, #64 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsr.w r3, r2, r3 +; V7M-NEXT: rsbs.w r12, r12, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r2, r2, r12 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_c0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr r12, [sp, #16] +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r5, r1, r2 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r4, r12, #64 +; V7A-NEXT: rsbs lr, r12, #32 +; V7A-NEXT: lsr r4, r3, r4 +; 
V7A-NEXT: lsrpl r3, r3, lr +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: subs lr, r2, #32 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: movwpl r5, #0 +; V7A-NEXT: and r12, r4, r5 +; V7A-NEXT: orr r0, r0, r1, lsl r2 +; V7A-NEXT: lsrpl r0, r1, lr +; V7A-NEXT: mov r1, r12 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: pop {r4, r5, r11, pc} +; +; V7A-T-LABEL: bextr64_c0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: mov.w lr, #-1 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orrs r0, r3 +; V7A-T-NEXT: subs.w r3, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r3 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: rsbs.w r2, r12, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r3, r2 +; V7A-T-NEXT: rsb.w r2, r12, #64 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: lsr.w r2, lr, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: ands r1, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bextr64_c0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: ldr r0, [sp, #16] +; V6M-NEXT: movs r1, #64 +; V6M-NEXT: subs r2, r1, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_c1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: uxtb r2, r2 +; V7M-NEXT: lsr.w r12, r0, r2 +; V7M-NEXT: rsb.w r0, r2, #32 +; V7M-NEXT: lsl.w r0, r1, r0 +; V7M-NEXT: orr.w r12, r12, r0 +; V7M-NEXT: subs.w r0, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r12, r1, r0 +; V7M-NEXT: rsb.w r0, r3, #64 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: uxtb r0, r0 +; V7M-NEXT: subs.w lr, r0, #32 +; V7M-NEXT: lsr.w r2, r3, r0 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r3, r3, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: and.w r0, r3, r12 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_c1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: uxtb r12, r2 +; V7A-NEXT: lsr lr, r0, r12 +; V7A-NEXT: rsb r0, r12, #32 +; V7A-NEXT: orr r4, lr, r1, lsl r0 +; V7A-NEXT: mvn lr, #31 +; V7A-NEXT: uxtab r2, lr, r2 +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: lsrpl r4, r1, r2 +; V7A-NEXT: rsb r2, r3, #64 +; V7A-NEXT: lsr r1, r1, r12 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: uxtb r12, r2 +; V7A-NEXT: uxtab r2, lr, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: lsr r0, r3, r12 +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: and r1, r0, r1 +; V7A-NEXT: lsrpl r3, r3, r2 +; V7A-NEXT: and r0, r3, r4 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_c1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: uxtb.w r12, r2 +; V7A-T-NEXT: lsr.w lr, r0, r12 +; V7A-T-NEXT: rsb.w r0, r12, #32 +; V7A-T-NEXT: lsl.w r0, r1, r0 +; V7A-T-NEXT: orr.w r4, lr, r0 +; 
V7A-T-NEXT: mvn lr, #31 +; V7A-T-NEXT: uxtab r2, lr, r2 +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r4, r1, r2 +; V7A-T-NEXT: rsb.w r2, r3, #64 +; V7A-T-NEXT: lsr.w r1, r1, r12 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: uxtb.w r12, r2 +; V7A-T-NEXT: uxtab r2, lr, r2 +; V7A-T-NEXT: lsr.w r0, r3, r12 +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: and.w r1, r1, r0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r3, r2 +; V7A-T-NEXT: and.w r0, r3, r4 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_c1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r3 +; V6M-NEXT: uxtb r2, r2 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r0, r0, r5 +; V6M-NEXT: uxtb r2, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r6 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %mask = lshr i64 -1, %sh_prom + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_c2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: ldr.w r12, [sp] +; V7M-NEXT: lsl.w r1, r3, r1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: orrs r0, r1 +; V7M-NEXT: subs.w r1, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r3, r1 +; V7M-NEXT: lsr.w r1, r3, r2 +; V7M-NEXT: rsb.w r3, r12, #64 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: rsbs.w r12, r12, #32 +; V7M-NEXT: lsr.w r3, r2, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r2, r2, r12 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_c2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r6, r8, lr} +; V7A-NEXT: push {r4, r6, r8, lr} +; V7A-NEXT: ldr r12, [sp, #16] +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: rsb r6, r12, #64 +; V7A-NEXT: ldr r8, [r0] +; V7A-NEXT: mvn r0, #0 +; V7A-NEXT: rsbs r1, r12, #32 +; V7A-NEXT: lsr r6, r0, r6 +; V7A-NEXT: lsr r4, r3, r2 +; V7A-NEXT: lsrpl r0, r0, r1 +; V7A-NEXT: movwpl r6, #0 +; V7A-NEXT: subs r12, r2, #32 +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: and r1, r6, r4 +; V7A-NEXT: lsr r6, r8, r2 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: orr r2, r6, r3, lsl r2 +; V7A-NEXT: lsrpl r2, r3, r12 +; V7A-NEXT: and r0, r0, r2 +; V7A-NEXT: pop {r4, r6, r8, pc} +; +; V7A-T-LABEL: bextr64_c2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldrd r0, r3, [r0] +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: lsl.w r1, r3, r1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: orrs r0, r1 +; V7A-T-NEXT: subs.w r1, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r3, r1 +; V7A-T-NEXT: lsr.w r1, r3, r2 +; V7A-T-NEXT: rsb.w r2, r12, #64 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: rsbs.w r12, r12, #32 +; V7A-T-NEXT: lsr.w r2, r3, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r3, r3, r12 +; V7A-T-NEXT: ands r1, r2 +; V7A-T-NEXT: ands r0, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_c2_load: +; V6M: @ %bb.0: 
+; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: ldr r0, [sp, #16] +; V6M-NEXT: movs r1, #64 +; V6M-NEXT: subs r2, r1, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %val = load i64, ptr %w + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_c3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsr.w r12, r0, r1 +; V7M-NEXT: rsb.w r0, r1, #32 +; V7M-NEXT: lsl.w r0, r3, r0 +; V7M-NEXT: orr.w r12, r12, r0 +; V7M-NEXT: subs.w r0, r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r12, r3, r0 +; V7M-NEXT: rsb.w r0, r2, #64 +; V7M-NEXT: lsr.w r1, r3, r1 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: uxtb r0, r0 +; V7M-NEXT: subs.w lr, r0, #32 +; V7M-NEXT: lsr.w r2, r3, r0 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r3, r3, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: and.w r0, r3, r12 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bextr64_c3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: ldr r4, [r0] +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: uxtb r0, r1 +; V7A-NEXT: lsr r12, r4, r0 +; V7A-NEXT: rsb r4, r0, #32 +; V7A-NEXT: lsr r0, r3, r0 +; V7A-NEXT: orr lr, r12, r3, lsl r4 +; V7A-NEXT: mvn r12, #31 +; V7A-NEXT: uxtab r1, r12, r1 +; V7A-NEXT: cmp r1, #0 +; V7A-NEXT: lsrpl lr, r3, r1 +; V7A-NEXT: rsb r1, r2, #64 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: uxtb r2, r1 +; V7A-NEXT: uxtab r4, r12, r1 +; V7A-NEXT: lsr r2, r3, r2 +; V7A-NEXT: cmp r4, #0 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: and r1, r2, r0 +; V7A-NEXT: lsrpl r3, r3, r4 +; V7A-NEXT: and r0, r3, lr +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bextr64_c3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, r5, r7, lr} +; V7A-T-NEXT: push {r4, r5, r7, lr} +; V7A-T-NEXT: ldrd r12, lr, [r0] +; V7A-T-NEXT: uxtb r0, r1 +; V7A-T-NEXT: rsb.w r3, r0, #32 +; V7A-T-NEXT: lsl.w r4, lr, r3 +; V7A-T-NEXT: lsr.w r3, r12, r0 +; V7A-T-NEXT: orr.w r5, r3, r4 +; V7A-T-NEXT: mvn r12, #31 +; V7A-T-NEXT: uxtab r1, r12, r1 +; V7A-T-NEXT: lsr.w r0, lr, r0 +; V7A-T-NEXT: cmp r1, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r5, lr, r1 +; V7A-T-NEXT: rsb.w r1, r2, #64 +; V7A-T-NEXT: mov.w r4, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: uxtb r2, r1 +; V7A-T-NEXT: uxtab r3, r12, r1 +; V7A-T-NEXT: lsr.w r2, r4, r2 +; V7A-T-NEXT: cmp r3, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: and.w r1, r2, r0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r4, r3 +; V7A-T-NEXT: and.w r0, r4, r5 +; V7A-T-NEXT: pop {r4, r5, r7, pc} +; +; V6M-LABEL: bextr64_c3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r2 +; V6M-NEXT: ldr r4, [r0] +; V6M-NEXT: ldr r3, [r0, #4] +; V6M-NEXT: uxtb r2, r1 +; V6M-NEXT: mov r0, r4 +; V6M-NEXT: mov r1, r3 +; 
V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r6, r0 +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r0, r0, r5 +; V6M-NEXT: uxtb r2, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r6 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %val = load i64, ptr %w + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %mask = lshr i64 -1, %sh_prom + %masked = and i64 %mask, %shifted + ret i64 %masked +} + +define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_c4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: ldr.w r12, [sp] +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: rsb.w r3, r12, #64 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsr.w r3, r2, r3 +; V7M-NEXT: rsbs.w r12, r12, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r2, r2, r12 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_c4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr r12, [sp, #16] +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r5, r1, r2 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r4, r12, #64 +; V7A-NEXT: rsbs lr, r12, #32 +; V7A-NEXT: lsr r4, r3, r4 +; V7A-NEXT: lsrpl r3, r3, lr +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: subs lr, r2, #32 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: movwpl r5, #0 +; V7A-NEXT: and r12, r5, r4 +; V7A-NEXT: orr r0, r0, r1, lsl r2 +; V7A-NEXT: lsrpl r0, r1, lr +; V7A-NEXT: mov r1, r12 +; V7A-NEXT: and r0, r0, r3 +; V7A-NEXT: pop {r4, r5, r11, pc} +; +; V7A-T-LABEL: bextr64_c4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: mov.w lr, #-1 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orrs r0, r3 +; V7A-T-NEXT: subs.w r3, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r3 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: rsbs.w r2, r12, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r3, r2 +; V7A-T-NEXT: rsb.w r2, r12, #64 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: lsr.w r2, lr, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: ands r1, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bextr64_c4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: ldr r0, [sp, #16] +; V6M-NEXT: movs r1, #64 +; V6M-NEXT: subs r2, r1, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %shifted, %mask ; swapped order + ret i64 %masked +} + +; 64-bit, but with 32-bit output + +; Everything done in 64-bit, truncation happens last. 
+define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_c0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: rsbs.w r1, r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl r2, r1 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_c0: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r3, [sp] +; V7A-NEXT: rsbs r12, r3, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsrpl r3, r3, r12 +; V7A-NEXT: lsr r12, r0, r2 +; V7A-NEXT: rsb r0, r2, #32 +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r12, r1, lsl r0 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_c0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: rsbs.w r1, r12, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r2, r1 +; V7A-T-NEXT: ands r0, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_c0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: ldr r0, [sp, #8] +; V6M-NEXT: movs r1, #64 +; V6M-NEXT: subs r2, r1, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %mask, %shifted + %res = trunc i64 %masked to i32 + ret i32 %res +} + +; Shifting happens in 64-bit, then truncation. Masking is 32-bit. 
+define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_c1: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_c1: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: rsb r1, r12, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_c1: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: rsb.w r1, r12, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_c1: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r1, [sp, #8] +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: pop {r7, pc} + %shifted = lshr i64 %val, %numskipbits + %truncshifted = trunc i64 %shifted to i32 + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %mask, %truncshifted + ret i32 %masked +} + +; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit. +; Masking is 64-bit. Then truncation. 
+define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_c2: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_c2: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: rsb r1, r12, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_c2: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: rsb.w r1, r12, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_c2: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r1, [sp, #8] +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: pop {r7, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %zextmask = zext i32 %mask to i64 + %masked = and i64 %zextmask, %shifted + %truncmasked = trunc i64 %masked to i32 + ret i32 %truncmasked +} + +; ---------------------------------------------------------------------------- ; +; Pattern d. 32-bit. 
+; ---------------------------------------------------------------------------- ; + +define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_d0: +; V7M: @ %bb.0: +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_d0: +; V7A: @ %bb.0: +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_d0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_d0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #32 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: lsrs r0, r2 +; V6M-NEXT: bx lr + %shifted = lshr i32 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %highbitscleared = shl i32 %shifted, %numhighbits + %masked = lshr i32 %highbitscleared, %numhighbits + ret i32 %masked +} + +define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr32_d1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_d1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_d1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_d1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #32 +; V6M-NEXT: subs r1, r1, r2 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %highbitscleared = shl i32 %shifted, %sh_prom + %masked = lshr i32 %highbitscleared, %sh_prom + ret i32 %masked +} + +define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr32_d2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_d2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_d2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_d2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r3, #32 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: lsls r0, r2 +; V6M-NEXT: lsrs r0, r2 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %shifted = lshr i32 %val, %numskipbits + %numhighbits = sub i32 32, %numlowbits + %highbitscleared = shl i32 %shifted, %numhighbits + %masked = lshr i32 
%highbitscleared, %numhighbits + ret i32 %masked +} + +define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr32_d3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr32_d3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: rsb r1, r2, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr32_d3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr32_d3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: movs r1, #32 +; V6M-NEXT: subs r1, r1, r2 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %skip = zext i8 %numskipbits to i32 + %shifted = lshr i32 %val, %skip + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %highbitscleared = shl i32 %shifted, %sh_prom + %masked = lshr i32 %highbitscleared, %sh_prom + ret i32 %masked +} + +; 64-bit. + +define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_d0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: rsb.w r3, r12, #64 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: rsb.w lr, r12, #32 +; V7M-NEXT: rsb.w r12, r3, #32 +; V7M-NEXT: lsls r1, r3 +; V7M-NEXT: cmp.w lr, #0 +; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: orr.w r1, r1, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, lr +; V7M-NEXT: lsl.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r2, r1, r12 +; V7M-NEXT: lsr.w r0, r0, r3 +; V7M-NEXT: orr.w r0, r0, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, lr +; V7M-NEXT: lsr.w r1, r1, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_d0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: lsr r3, r1, r2 +; V7A-NEXT: subs lr, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: movwpl r3, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r2 +; V7A-NEXT: lsrpl r0, r1, lr +; V7A-NEXT: rsb r1, r12, #64 +; V7A-NEXT: rsb lr, r1, #32 +; V7A-NEXT: lsr r2, r0, lr +; V7A-NEXT: orr r2, r2, r3, lsl r1 +; V7A-NEXT: rsbs r3, r12, #32 +; V7A-NEXT: lslpl r2, r0, r3 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: lsr r1, r2, r1 +; V7A-NEXT: orr r0, r0, r2, lsl lr +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: lsrpl r0, r2, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_d0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; 
V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orrs r0, r3 +; V7A-T-NEXT: subs.w r3, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r3 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: rsb.w r3, r12, #64 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: lsls r1, r3 +; V7A-T-NEXT: rsbs.w r2, r12, #32 +; V7A-T-NEXT: lsr.w r4, r0, lr +; V7A-T-NEXT: orr.w r1, r1, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r1, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r4, r1, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: lsr.w r1, r1, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_d0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r2, [sp, #8] +; V6M-NEXT: movs r3, #64 +; V6M-NEXT: subs r4, r3, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %highbitscleared = shl i64 %shifted, %numhighbits + %masked = lshr i64 %highbitscleared, %numhighbits + ret i64 %masked +} + +define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_d1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: uxtb.w lr, r2 +; V7M-NEXT: subs.w r2, lr, #32 +; V7M-NEXT: lsr.w r12, r0, lr +; V7M-NEXT: rsb.w r0, lr, #32 +; V7M-NEXT: lsl.w r0, r1, r0 +; V7M-NEXT: orr.w r0, r0, r12 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: rsb.w r2, r3, #64 +; V7M-NEXT: lsr.w r1, r1, lr +; V7M-NEXT: uxtb r2, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: rsb.w r12, r2, #32 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: sub.w r3, r2, #32 +; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: orrs r1, r4 +; V7M-NEXT: cmp r3, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, r3 +; V7M-NEXT: lsl.w r0, r0, r2 +; V7M-NEXT: lsl.w r4, r1, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsr.w r0, r0, r2 +; V7M-NEXT: orr.w r0, r0, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_d1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: uxtb r12, r2 +; V7A-NEXT: lsr lr, r0, r12 +; V7A-NEXT: rsb r0, r12, #32 +; V7A-NEXT: orr r0, lr, r1, lsl r0 +; V7A-NEXT: mvn lr, #31 +; V7A-NEXT: uxtab r2, lr, r2 +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: rsb r2, r3, #64 +; V7A-NEXT: lsr r1, r1, r12 +; V7A-NEXT: uxtb r3, r2 +; V7A-NEXT: rsb r4, r3, #32 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: uxtab r2, lr, r2 +; V7A-NEXT: lsr r5, r0, r4 +; V7A-NEXT: orr r1, r5, r1, lsl r3 +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: lslpl r1, r0, r2 +; V7A-NEXT: lsl r0, r0, r3 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r3 +; V7A-NEXT: orr r0, r0, r1, lsl r4 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: lsr r1, r1, r3 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r4, r5, r11, pc} +; +; V7A-T-LABEL: bextr64_d1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, r5, r6, r7, lr} +; V7A-T-NEXT: push {r4, r5, r6, r7, lr} +; V7A-T-NEXT: uxtb.w r12, r2 
+; V7A-T-NEXT: rsb.w r6, r12, #32 +; V7A-T-NEXT: rsb.w r3, r3, #64 +; V7A-T-NEXT: lsr.w r0, r0, r12 +; V7A-T-NEXT: mvn r7, #31 +; V7A-T-NEXT: uxtab r2, r7, r2 +; V7A-T-NEXT: lsl.w r6, r1, r6 +; V7A-T-NEXT: lsr.w lr, r1, r12 +; V7A-T-NEXT: orrs r0, r6 +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w lr, #0 +; V7A-T-NEXT: uxtb r5, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: rsb.w r1, r5, #32 +; V7A-T-NEXT: uxtab r3, r7, r3 +; V7A-T-NEXT: lsl.w r4, lr, r5 +; V7A-T-NEXT: lsr.w r2, r0, r1 +; V7A-T-NEXT: cmp r3, #0 +; V7A-T-NEXT: orr.w r2, r2, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r2, r0, r3 +; V7A-T-NEXT: lsl.w r0, r0, r5 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: lsr.w r0, r0, r5 +; V7A-T-NEXT: orr.w r0, r0, r1 +; V7A-T-NEXT: lsr.w r1, r2, r5 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r2, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, r5, r6, r7, pc} +; +; V6M-LABEL: bextr64_d1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: mov r4, r3 +; V6M-NEXT: uxtb r2, r2 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: movs r2, #64 +; V6M-NEXT: subs r2, r2, r4 +; V6M-NEXT: uxtb r4, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %highbitscleared = shl i64 %shifted, %sh_prom + %masked = lshr i64 %highbitscleared, %sh_prom + ret i64 %masked +} + +define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_d2_load: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsl.w r1, r3, r1 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: rsb.w lr, r12, #32 +; V7M-NEXT: orrs r0, r1 +; V7M-NEXT: subs.w r1, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r3, r1 +; V7M-NEXT: rsb.w r1, r12, #64 +; V7M-NEXT: lsr.w r2, r3, r2 +; V7M-NEXT: rsb.w r12, r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: cmp.w lr, #0 +; V7M-NEXT: lsl.w r2, r2, r1 +; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: orr.w r2, r2, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r0, lr +; V7M-NEXT: lsl.w r0, r0, r1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r3, r2, r12 +; V7M-NEXT: lsr.w r0, r0, r1 +; V7M-NEXT: lsr.w r1, r2, r1 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r2, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_d2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: ldrd r0, r1, [r0] +; V7A-NEXT: subs lr, r2, #32 +; V7A-NEXT: lsr r3, r1, r2 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: movwpl r3, #0 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r2 +; V7A-NEXT: lsrpl r0, r1, lr +; V7A-NEXT: rsb r1, r12, #64 +; V7A-NEXT: rsb lr, r1, #32 +; V7A-NEXT: lsr r2, r0, lr +; V7A-NEXT: orr r2, r2, r3, lsl r1 +; V7A-NEXT: rsbs r3, r12, #32 +; V7A-NEXT: lslpl r2, r0, r3 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: lsr r1, r2, r1 +; V7A-NEXT: orr r0, r0, r2, lsl lr +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: lsrpl r0, r2, r3 +; 
V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_d2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: ldrd r0, r3, [r0] +; V7A-T-NEXT: rsb.w r1, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: lsl.w r1, r3, r1 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: orrs r0, r1 +; V7A-T-NEXT: subs.w r1, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r3, r1 +; V7A-T-NEXT: lsr.w r2, r3, r2 +; V7A-T-NEXT: rsb.w r1, r12, #64 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: rsb.w lr, r1, #32 +; V7A-T-NEXT: rsbs.w r3, r12, #32 +; V7A-T-NEXT: lsl.w r2, r2, r1 +; V7A-T-NEXT: lsr.w r4, r0, lr +; V7A-T-NEXT: orr.w r2, r2, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r2, r0, r3 +; V7A-T-NEXT: lsl.w r0, r0, r1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r4, r2, lr +; V7A-T-NEXT: lsr.w r0, r0, r1 +; V7A-T-NEXT: lsr.w r1, r2, r1 +; V7A-T-NEXT: orr.w r0, r0, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r2, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_d2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: ldr r3, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r2, [sp, #8] +; V6M-NEXT: movs r3, #64 +; V6M-NEXT: subs r4, r3, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %highbitscleared = shl i64 %shifted, %numhighbits + %masked = lshr i64 %highbitscleared, %numhighbits + ret i64 %masked +} + +define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind { +; V7M-LABEL: bextr64_d3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: ldrd r0, lr, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: subs.w r3, r1, #32 +; V7M-NEXT: lsr.w r12, r0, r1 +; V7M-NEXT: rsb.w r0, r1, #32 +; V7M-NEXT: lsr.w r1, lr, r1 +; V7M-NEXT: uxtb r2, r2 +; V7M-NEXT: lsl.w r0, lr, r0 +; V7M-NEXT: orr.w r0, r0, r12 +; V7M-NEXT: rsb.w r12, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, lr, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: lsls r1, r2 +; V7M-NEXT: sub.w r3, r2, #32 +; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: orrs r1, r4 +; V7M-NEXT: cmp r3, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, r3 +; V7M-NEXT: lsl.w r0, r0, r2 +; V7M-NEXT: lsl.w r4, r1, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsr.w r0, r0, r2 +; V7M-NEXT: orr.w r0, r0, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_d3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr r4, [r0] +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: uxtb r0, r1 +; V7A-NEXT: lsr r12, r4, r0 +; V7A-NEXT: rsb r4, r0, #32 +; V7A-NEXT: lsr r0, r3, r0 +; V7A-NEXT: orr r4, r12, r3, lsl r4 +; V7A-NEXT: mvn r12, #31 +; V7A-NEXT: uxtab r1, r12, r1 +; V7A-NEXT: cmp r1, #0 +; V7A-NEXT: lsrpl r4, r3, r1 +; V7A-NEXT: rsb r1, r2, #64 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: uxtb r2, r1 +; V7A-NEXT: rsb lr, r2, #32 +; V7A-NEXT: uxtab r1, r12, r1 +; V7A-NEXT: lsr r5, r4, lr +; V7A-NEXT: orr r3, r5, r0, lsl r2 +; 
V7A-NEXT: cmp r1, #0 +; V7A-NEXT: lsl r0, r4, r2 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lslpl r3, r4, r1 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: orr r0, r0, r3, lsl lr +; V7A-NEXT: lsrpl r0, r3, r1 +; V7A-NEXT: lsr r1, r3, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r4, r5, r11, pc} +; +; V7A-T-LABEL: bextr64_d3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, r5, r6, lr} +; V7A-T-NEXT: push {r4, r5, r6, lr} +; V7A-T-NEXT: ldrd r12, lr, [r0] +; V7A-T-NEXT: uxtb r0, r1 +; V7A-T-NEXT: rsb.w r6, r0, #32 +; V7A-T-NEXT: lsr.w r3, lr, r0 +; V7A-T-NEXT: rsb.w r2, r2, #64 +; V7A-T-NEXT: mvn r4, #31 +; V7A-T-NEXT: lsr.w r0, r12, r0 +; V7A-T-NEXT: uxtab r1, r4, r1 +; V7A-T-NEXT: lsl.w r6, lr, r6 +; V7A-T-NEXT: orrs r0, r6 +; V7A-T-NEXT: cmp r1, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: uxtb r5, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, lr, r1 +; V7A-T-NEXT: rsb.w r1, r5, #32 +; V7A-T-NEXT: lsls r3, r5 +; V7A-T-NEXT: uxtab r2, r4, r2 +; V7A-T-NEXT: lsr.w r6, r0, r1 +; V7A-T-NEXT: orrs r3, r6 +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r5 +; V7A-T-NEXT: lsl.w r1, r3, r1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsr.w r0, r0, r5 +; V7A-T-NEXT: orr.w r0, r0, r1 +; V7A-T-NEXT: lsr.w r1, r3, r5 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r3, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, r5, r6, pc} +; +; V6M-LABEL: bextr64_d3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r2 +; V6M-NEXT: ldr r5, [r0] +; V6M-NEXT: ldr r3, [r0, #4] +; V6M-NEXT: uxtb r2, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r3 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: movs r2, #64 +; V6M-NEXT: subs r2, r2, r4 +; V6M-NEXT: uxtb r4, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, r5, r7, pc} + %val = load i64, ptr %w + %skip = zext i8 %numskipbits to i64 + %shifted = lshr i64 %val, %skip + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %highbitscleared = shl i64 %shifted, %sh_prom + %masked = lshr i64 %highbitscleared, %sh_prom + ret i64 %masked +} + +; 64-bit, but with 32-bit output + +; Everything done in 64-bit, truncation happens last. 
+define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_d0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r4, lr} +; V7M-NEXT: push {r4, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: ldr.w r12, [sp, #8] +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orrs r0, r3 +; V7M-NEXT: subs.w r3, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: rsb.w r3, r12, #64 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: rsb.w lr, r12, #32 +; V7M-NEXT: rsb.w r12, r3, #32 +; V7M-NEXT: lsls r1, r3 +; V7M-NEXT: cmp.w lr, #0 +; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: orr.w r1, r1, r4 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, lr +; V7M-NEXT: lsl.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r2, r1, r12 +; V7M-NEXT: lsr.w r0, r0, r3 +; V7M-NEXT: orr.w r0, r0, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, lr +; V7M-NEXT: pop {r4, pc} +; +; V7A-LABEL: bextr64_32_d0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: lsr r3, r1, r2 +; V7A-NEXT: subs lr, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: rsb r2, r2, #32 +; V7A-NEXT: ldr r12, [sp, #8] +; V7A-NEXT: movwpl r3, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r2 +; V7A-NEXT: lsrpl r0, r1, lr +; V7A-NEXT: rsb r1, r12, #64 +; V7A-NEXT: rsb lr, r1, #32 +; V7A-NEXT: lsr r2, r0, lr +; V7A-NEXT: orr r2, r2, r3, lsl r1 +; V7A-NEXT: rsbs r3, r12, #32 +; V7A-NEXT: lslpl r2, r0, r3 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: orr r0, r0, r2, lsl lr +; V7A-NEXT: lsrpl r0, r2, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bextr64_32_d0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: ldr.w r12, [sp, #8] +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orrs r0, r3 +; V7A-T-NEXT: subs.w r3, r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r3 +; V7A-T-NEXT: lsr.w r1, r1, r2 +; V7A-T-NEXT: rsb.w r3, r12, #64 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: lsls r1, r3 +; V7A-T-NEXT: rsbs.w r2, r12, #32 +; V7A-T-NEXT: lsr.w r4, r0, lr +; V7A-T-NEXT: orr.w r1, r1, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r1, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r4, r1, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bextr64_32_d0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r2, [sp, #8] +; V6M-NEXT: movs r3, #64 +; V6M-NEXT: subs r4, r3, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %shifted = lshr i64 %val, %numskipbits + %numhighbits = sub i64 64, %numlowbits + %highbitscleared = shl i64 %shifted, %numhighbits + %masked = lshr i64 %highbitscleared, %numhighbits + %res = trunc i64 %masked to i32 + ret i32 %res +} + +; Shifting happens in 64-bit, then truncation. Masking is 32-bit. 
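For reference, a minimal C sketch (not part of the patch) contrasting the two shapes exercised here: bextr64_32_d0 above keeps the whole shl/lshr mask computation in 64 bits and truncates last, while bextr64_32_d1 below truncates right after the shift and masks in 32 bits. The helper names and the sample values in main are illustrative only; the sketch assumes 0 < numlowbits (and numlowbits <= 32 for the d1 shape) and numskipbits < 64, so no shift amount reaches the operand width (poison in the IR, undefined behaviour in C).

    #include <stdint.h>
    #include <stdio.h>

    /* d0 shape: 64-bit masking, truncation happens last. */
    static uint32_t bextr64_32_d0_ref(uint64_t val, uint64_t skip, uint64_t lowbits) {
      uint64_t shifted = val >> skip;                 /* lshr i64 */
      uint64_t numhighbits = 64 - lowbits;
      uint64_t cleared = (shifted << numhighbits) >> numhighbits; /* shl + lshr i64 */
      return (uint32_t)cleared;                       /* trunc to i32 at the end */
    }

    /* d1 shape: truncate after the shift, then mask in 32 bits. */
    static uint32_t bextr64_32_d1_ref(uint64_t val, uint64_t skip, uint32_t lowbits) {
      uint32_t truncshifted = (uint32_t)(val >> skip); /* lshr i64, trunc to i32 */
      uint32_t numhighbits = 32 - lowbits;
      return (truncshifted << numhighbits) >> numhighbits; /* shl + lshr i32 */
    }

    int main(void) {
      uint64_t val = 0x0123456789abcdefULL;
      /* With numlowbits <= 32 both shapes extract the same bit field. */
      printf("%u %u\n", bextr64_32_d0_ref(val, 8, 10), bextr64_32_d1_ref(val, 8, 10));
      return 0;
    }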
+define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { +; V7M-LABEL: bextr64_32_d1: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsrs r0, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: ldr r1, [sp] +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bextr64_32_d1: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: ldr r12, [sp] +; V7A-NEXT: subs r2, r2, #32 +; V7A-NEXT: orr r0, r0, r1, lsl r3 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: rsb r1, r12, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bextr64_32_d1: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: lsrs r0, r2 +; V7A-T-NEXT: ldr.w r12, [sp] +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: lsl.w r3, r1, r3 +; V7A-T-NEXT: orr.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: rsb.w r1, r12, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bextr64_32_d1: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r7, lr} +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldr r1, [sp, #8] +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: pop {r7, pc} + %shifted = lshr i64 %val, %numskipbits + %truncshifted = trunc i64 %shifted to i32 + %numhighbits = sub i32 32, %numlowbits + %highbitscleared = shl i32 %truncshifted, %numhighbits + %masked = lshr i32 %highbitscleared, %numhighbits + ret i32 %masked +} + +; ---------------------------------------------------------------------------- ; +; Constant +; ---------------------------------------------------------------------------- ; + +; https://bugs.llvm.org/show_bug.cgi?id=38938 +define void @pr38938(ptr %a0, ptr %a1) nounwind { +; V7M-LABEL: pr38938: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r1, [r1] +; V7M-NEXT: ubfx r1, r1, #21, #10 +; V7M-NEXT: ldr.w r2, [r0, r1, lsl #2] +; V7M-NEXT: adds r2, #1 +; V7M-NEXT: str.w r2, [r0, r1, lsl #2] +; V7M-NEXT: bx lr +; +; V7A-LABEL: pr38938: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r1, [r1] +; V7A-NEXT: ubfx r1, r1, #21, #10 +; V7A-NEXT: ldr r2, [r0, r1, lsl #2] +; V7A-NEXT: add r2, r2, #1 +; V7A-NEXT: str r2, [r0, r1, lsl #2] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: pr38938: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r1, [r1] +; V7A-T-NEXT: ubfx r1, r1, #21, #10 +; V7A-T-NEXT: ldr.w r2, [r0, r1, lsl #2] +; V7A-T-NEXT: adds r2, #1 +; V7A-T-NEXT: str.w r2, [r0, r1, lsl #2] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: pr38938: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, [r1] +; V6M-NEXT: lsrs r1, r1, #19 +; V6M-NEXT: ldr r2, .LCPI51_0 +; V6M-NEXT: ands r2, r1 +; V6M-NEXT: ldr r1, [r0, r2] +; V6M-NEXT: adds r1, r1, #1 +; V6M-NEXT: str r1, [r0, r2] +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI51_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp = load i64, ptr %a1, align 8 + %tmp1 = lshr i64 %tmp, 21 + %tmp2 = and i64 %tmp1, 1023 + %tmp3 = getelementptr inbounds i32, ptr %a0, i64 %tmp2 + %tmp4 = load i32, ptr %tmp3, align 4 + %tmp5 = add nsw i32 %tmp4, 1 + store i32 %tmp5, ptr %tmp3, align 4 + ret void +} + +; The most canonical variant +define i32 @c0_i32(i32 %arg) nounwind { +; V7M-LABEL: c0_i32: +; V7M: @ %bb.0: +; V7M-NEXT: ubfx r0, r0, #19, #10 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c0_i32: +; V7A: @ %bb.0: +; 
V7A-NEXT: ubfx r0, r0, #19, #10 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c0_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ubfx r0, r0, #19, #10 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c0_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsls r0, r0, #3 +; V6M-NEXT: lsrs r0, r0, #22 +; V6M-NEXT: bx lr + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + ret i32 %tmp1 +} + +; Should be still fine, but the mask is shifted +define i32 @c1_i32(i32 %arg) nounwind { +; V7M-LABEL: c1_i32: +; V7M: @ %bb.0: +; V7M-NEXT: movw r1, #4092 +; V7M-NEXT: and.w r0, r1, r0, lsr #19 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c1_i32: +; V7A: @ %bb.0: +; V7A-NEXT: movw r1, #4092 +; V7A-NEXT: and r0, r1, r0, lsr #19 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c1_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movw r1, #4092 +; V7A-T-NEXT: and.w r0, r1, r0, lsr #19 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c1_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r1, r0, #19 +; V6M-NEXT: ldr r0, .LCPI53_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI53_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 4092 + ret i32 %tmp1 +} + +; Should be still fine, but the result is shifted left afterwards +define i32 @c2_i32(i32 %arg) nounwind { +; V7M-LABEL: c2_i32: +; V7M: @ %bb.0: +; V7M-NEXT: movw r1, #4092 +; V7M-NEXT: and.w r0, r1, r0, lsr #17 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c2_i32: +; V7A: @ %bb.0: +; V7A-NEXT: movw r1, #4092 +; V7A-NEXT: and r0, r1, r0, lsr #17 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c2_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movw r1, #4092 +; V7A-T-NEXT: and.w r0, r1, r0, lsr #17 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c2_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r1, r0, #17 +; V6M-NEXT: ldr r0, .LCPI54_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI54_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + %tmp2 = shl i32 %tmp1, 2 + ret i32 %tmp2 +} + +; The mask covers newly shifted-in bit +define i32 @c4_i32_bad(i32 %arg) nounwind { +; V7M-LABEL: c4_i32_bad: +; V7M: @ %bb.0: +; V7M-NEXT: mvn r1, #1 +; V7M-NEXT: and.w r0, r1, r0, lsr #19 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c4_i32_bad: +; V7A: @ %bb.0: +; V7A-NEXT: mvn r1, #1 +; V7A-NEXT: and r0, r1, r0, lsr #19 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c4_i32_bad: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mvn r1, #1 +; V7A-T-NEXT: and.w r0, r1, r0, lsr #19 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c4_i32_bad: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r0, #20 +; V6M-NEXT: lsls r0, r0, #1 +; V6M-NEXT: bx lr + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 16382 + ret i32 %tmp1 +} + +; i64 + +; The most canonical variant +define i64 @c0_i64(i64 %arg) nounwind { +; V7M-LABEL: c0_i64: +; V7M: @ %bb.0: +; V7M-NEXT: ubfx r0, r1, #19, #10 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c0_i64: +; V7A: @ %bb.0: +; V7A-NEXT: ubfx r0, r1, #19, #10 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c0_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ubfx r0, r1, #19, #10 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c0_i64: +; V6M: @ %bb.0: +; V6M-NEXT: lsls r0, r1, #3 +; V6M-NEXT: lsrs r0, r0, #22 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + ret i64 %tmp1 +} + +; Should be still fine, but the mask is shifted +define i64 @c1_i64(i64 %arg) nounwind { +; V7M-LABEL: c1_i64: +; V7M: @ %bb.0: +; V7M-NEXT: movw r0, #4092 +; V7M-NEXT: and.w r0, r0, r1, 
lsr #19 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c1_i64: +; V7A: @ %bb.0: +; V7A-NEXT: movw r0, #4092 +; V7A-NEXT: and r0, r0, r1, lsr #19 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c1_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movw r0, #4092 +; V7A-T-NEXT: and.w r0, r0, r1, lsr #19 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c1_i64: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r1, r1, #19 +; V6M-NEXT: ldr r0, .LCPI57_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI57_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 4092 + ret i64 %tmp1 +} + +; Should be still fine, but the result is shifted left afterwards +define i64 @c2_i64(i64 %arg) nounwind { +; V7M-LABEL: c2_i64: +; V7M: @ %bb.0: +; V7M-NEXT: movw r0, #4092 +; V7M-NEXT: and.w r0, r0, r1, lsr #17 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c2_i64: +; V7A: @ %bb.0: +; V7A-NEXT: movw r0, #4092 +; V7A-NEXT: and r0, r0, r1, lsr #17 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c2_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movw r0, #4092 +; V7A-T-NEXT: and.w r0, r0, r1, lsr #17 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c2_i64: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r1, r1, #17 +; V6M-NEXT: ldr r0, .LCPI58_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI58_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + %tmp2 = shl i64 %tmp1, 2 + ret i64 %tmp2 +} + +; The mask covers newly shifted-in bit +define i64 @c4_i64_bad(i64 %arg) nounwind { +; V7M-LABEL: c4_i64_bad: +; V7M: @ %bb.0: +; V7M-NEXT: mvn r0, #1 +; V7M-NEXT: and.w r0, r0, r1, lsr #19 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: c4_i64_bad: +; V7A: @ %bb.0: +; V7A-NEXT: mvn r0, #1 +; V7A-NEXT: and r0, r0, r1, lsr #19 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c4_i64_bad: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mvn r0, #1 +; V7A-T-NEXT: and.w r0, r0, r1, lsr #19 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c4_i64_bad: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r1, #20 +; V6M-NEXT: lsls r0, r0, #1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 16382 + ret i64 %tmp1 +} + +; ---------------------------------------------------------------------------- ; +; Constant, storing the result afterwards. 
+; ---------------------------------------------------------------------------- ; + +; i32 + +; The most canonical variant +define void @c5_i32(i32 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c5_i32: +; V7M: @ %bb.0: +; V7M-NEXT: ubfx r0, r0, #19, #10 +; V7M-NEXT: str r0, [r1] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c5_i32: +; V7A: @ %bb.0: +; V7A-NEXT: ubfx r0, r0, #19, #10 +; V7A-NEXT: str r0, [r1] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c5_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ubfx r0, r0, #19, #10 +; V7A-T-NEXT: str r0, [r1] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c5_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsls r0, r0, #3 +; V6M-NEXT: lsrs r0, r0, #22 +; V6M-NEXT: str r0, [r1] +; V6M-NEXT: bx lr + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + store i32 %tmp1, ptr %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i32(i32 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c6_i32: +; V7M: @ %bb.0: +; V7M-NEXT: ubfx r0, r0, #19, #12 +; V7M-NEXT: str r0, [r1] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c6_i32: +; V7A: @ %bb.0: +; V7A-NEXT: ubfx r0, r0, #19, #12 +; V7A-NEXT: str r0, [r1] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c6_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ubfx r0, r0, #19, #12 +; V7A-T-NEXT: str r0, [r1] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c6_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsls r0, r0, #1 +; V6M-NEXT: lsrs r0, r0, #20 +; V6M-NEXT: str r0, [r1] +; V6M-NEXT: bx lr + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 4095 + store i32 %tmp1, ptr %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void @c7_i32(i32 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c7_i32: +; V7M: @ %bb.0: +; V7M-NEXT: movw r2, #4092 +; V7M-NEXT: and.w r0, r2, r0, lsr #17 +; V7M-NEXT: str r0, [r1] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c7_i32: +; V7A: @ %bb.0: +; V7A-NEXT: movw r2, #4092 +; V7A-NEXT: and r0, r2, r0, lsr #17 +; V7A-NEXT: str r0, [r1] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c7_i32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movw r2, #4092 +; V7A-T-NEXT: and.w r0, r2, r0, lsr #17 +; V7A-T-NEXT: str r0, [r1] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c7_i32: +; V6M: @ %bb.0: +; V6M-NEXT: lsrs r0, r0, #17 +; V6M-NEXT: ldr r2, .LCPI62_0 +; V6M-NEXT: ands r2, r0 +; V6M-NEXT: str r2, [r1] +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI62_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + %tmp2 = shl i32 %tmp1, 2 + store i32 %tmp2, ptr %ptr + ret void +} + +; i64 + +; The most canonical variant +define void @c5_i64(i64 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c5_i64: +; V7M: @ %bb.0: +; V7M-NEXT: movs r0, #0 +; V7M-NEXT: ubfx r1, r1, #19, #10 +; V7M-NEXT: strd r1, r0, [r2] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c5_i64: +; V7A: @ %bb.0: +; V7A-NEXT: mov r0, #0 +; V7A-NEXT: str r0, [r2, #4] +; V7A-NEXT: ubfx r0, r1, #19, #10 +; V7A-NEXT: str r0, [r2] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c5_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r0, #0 +; V7A-T-NEXT: ubfx r1, r1, #19, #10 +; V7A-T-NEXT: strd r1, r0, [r2] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c5_i64: +; V6M: @ %bb.0: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: lsls r1, r1, #3 +; V6M-NEXT: lsrs r1, r1, #22 +; V6M-NEXT: str r1, [r2] +; V6M-NEXT: str r0, [r2, #4] +; V6M-NEXT: bx lr + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + store i64 %tmp1, ptr %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i64(i64 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c6_i64: +; V7M: @ %bb.0: +; V7M-NEXT: movs r0, 
#0 +; V7M-NEXT: ubfx r1, r1, #19, #12 +; V7M-NEXT: strd r1, r0, [r2] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c6_i64: +; V7A: @ %bb.0: +; V7A-NEXT: mov r0, #0 +; V7A-NEXT: str r0, [r2, #4] +; V7A-NEXT: ubfx r0, r1, #19, #12 +; V7A-NEXT: str r0, [r2] +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c6_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r0, #0 +; V7A-T-NEXT: ubfx r1, r1, #19, #12 +; V7A-T-NEXT: strd r1, r0, [r2] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c6_i64: +; V6M: @ %bb.0: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: lsls r1, r1, #1 +; V6M-NEXT: lsrs r1, r1, #20 +; V6M-NEXT: str r1, [r2] +; V6M-NEXT: str r0, [r2, #4] +; V6M-NEXT: bx lr + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 4095 + store i64 %tmp1, ptr %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void @c7_i64(i64 %arg, ptr %ptr) nounwind { +; V7M-LABEL: c7_i64: +; V7M: @ %bb.0: +; V7M-NEXT: movs r0, #0 +; V7M-NEXT: movw r3, #4092 +; V7M-NEXT: and.w r1, r3, r1, lsr #17 +; V7M-NEXT: strd r1, r0, [r2] +; V7M-NEXT: bx lr +; +; V7A-LABEL: c7_i64: +; V7A: @ %bb.0: +; V7A-NEXT: movw r0, #4092 +; V7A-NEXT: mov r3, #0 +; V7A-NEXT: and r0, r0, r1, lsr #17 +; V7A-NEXT: stm r2, {r0, r3} +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: c7_i64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r0, #0 +; V7A-T-NEXT: movw r3, #4092 +; V7A-T-NEXT: and.w r1, r3, r1, lsr #17 +; V7A-T-NEXT: strd r1, r0, [r2] +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: c7_i64: +; V6M: @ %bb.0: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: lsrs r1, r1, #17 +; V6M-NEXT: ldr r3, .LCPI65_0 +; V6M-NEXT: ands r3, r1 +; V6M-NEXT: str r3, [r2] +; V6M-NEXT: str r0, [r2, #4] +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI65_0: +; V6M-NEXT: .long 4092 @ 0xffc + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + %tmp2 = shl i64 %tmp1, 2 + store i64 %tmp2, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll new file mode 100644 index 0000000..b483793 --- /dev/null +++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll @@ -0,0 +1,2752 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M +; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A +; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T +; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M + +; Patterns: +; a) x & (1 << nbits) - 1 +; b) x & ~(-1 << nbits) +; c) x & (-1 >> (32 - y)) +; d) x << (32 - y) >> (32 - y) +; are equivalent. + +; ---------------------------------------------------------------------------- ; +; Pattern a. 
32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_a0: +; V7M: @ %bb.0: +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_a0: +; V7A: @ %bb.0: +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r1, r3, r2, lsl r1 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_a0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r2, #1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_a0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi32_a1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_a1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r1, r3, r2, lsl r1 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_a1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r2, #1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_a1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %conv = zext i8 %numlowbits to i32 + %onebit = shl i32 1, %conv + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_a2_load(ptr %w, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_a2_load: +; V7M: @ %bb.0: +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_a2_load: +; V7A: @ %bb.0: +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r1, r3, r2, lsl r1 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_a2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r2, #1 +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_a2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi32_a3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_a3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r1, r3, r2, lsl r1 +; V7A-NEXT: and r0, r1, r0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_a3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: 
movs r2, #1 +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_a3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %conv = zext i8 %numlowbits to i32 + %onebit = shl i32 1, %conv + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_a4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: movs r2, #1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: subs r1, #1 +; V7M-NEXT: ands r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_a4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: add r1, r3, r2, lsl r1 +; V7A-NEXT: and r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_a4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: movs r2, #1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: subs r1, #1 +; V7A-T-NEXT: ands r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_a4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #1 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: subs r1, r2, #1 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %onebit = shl i32 1, %numlowbits + %mask = add nsw i32 %onebit, -1 + %masked = and i32 %val, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: mov.w r12, #1 +; V7M-NEXT: subs.w lr, r2, #32 +; V7M-NEXT: lsl.w r2, r12, r2 +; V7M-NEXT: lsr.w r3, r12, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r12, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r2, #1 +; V7M-NEXT: sbc r3, r3, #0 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_a0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: subs r2, r2, #1 +; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: and r0, r2, r0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_a0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: mov.w r12, #1 +; V7A-T-NEXT: subs.w lr, r2, #32 +; V7A-T-NEXT: lsl.w r2, r12, r2 +; V7A-T-NEXT: lsr.w r3, r12, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r12, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: subs r2, #1 +; V7A-T-NEXT: sbc r3, r3, #0 +; V7A-T-NEXT: ands r0, r2 +; V7A-T-NEXT: ands r1, r3 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r6, #0 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: sbcs r1, r6 +; V6M-NEXT: ands r1, r5 +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +; Check that we don't throw 
away the vreg_width-1 mask if not using shifts +define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a0_masked: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: and r2, r2, #63 +; V7M-NEXT: mov.w r12, #1 +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: subs.w lr, r2, #32 +; V7M-NEXT: lsl.w r2, r12, r2 +; V7M-NEXT: lsr.w r3, r12, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r12, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r2, #1 +; V7M-NEXT: sbc r3, r3, #0 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_a0_masked: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: and r2, r2, #63 +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: subs r2, r2, #1 +; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: and r0, r2, r0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_a0_masked: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: and r2, r2, #63 +; V7A-T-NEXT: mov.w r12, #1 +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: subs.w lr, r2, #32 +; V7A-T-NEXT: lsl.w r2, r12, r2 +; V7A-T-NEXT: lsr.w r3, r12, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r12, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: subs r2, #1 +; V7A-T-NEXT: sbc r3, r3, #0 +; V7A-T-NEXT: ands r0, r2 +; V7A-T-NEXT: ands r1, r3 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a0_masked: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #63 +; V6M-NEXT: ands r2, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r6, #0 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: sbcs r1, r6 +; V6M-NEXT: ands r1, r5 +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %numlowbits.masked = and i64 %numlowbits, 63 + %onebit = shl i64 1, %numlowbits.masked + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: mov.w r12, #1 +; V7M-NEXT: subs.w lr, r2, #32 +; V7M-NEXT: lsl.w r2, r12, r2 +; V7M-NEXT: lsr.w r3, r12, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r12, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r2, #1 +; V7M-NEXT: sbc r3, r3, #0 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_a1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: subs r2, r2, #1 +; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: and r0, r2, r0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_a1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: mov.w r12, #1 +; V7A-T-NEXT: subs.w lr, r2, #32 +; V7A-T-NEXT: lsl.w r2, 
r12, r2 +; V7A-T-NEXT: lsr.w r3, r12, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r12, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: subs r2, #1 +; V7A-T-NEXT: sbc r3, r3, #0 +; V7A-T-NEXT: ands r0, r2 +; V7A-T-NEXT: ands r1, r3 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r6, #0 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: sbcs r1, r6 +; V6M-NEXT: ands r1, r5 +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %conv = zext i8 %numlowbits to i64 + %onebit = shl i64 1, %conv + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a2_load: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r2, #32 +; V7M-NEXT: movs r3, #1 +; V7M-NEXT: subs.w r12, r2, #32 +; V7M-NEXT: lsl.w r2, r3, r2 +; V7M-NEXT: lsr.w r1, r3, r1 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r3, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r2, #1 +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: sbc r1, r1, #0 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_a2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r6, r11, lr} +; V7A-NEXT: push {r4, r6, r11, lr} +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: mov r1, #1 +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: rsb r0, r2, #32 +; V7A-NEXT: subs r4, r2, #32 +; V7A-NEXT: lsr r0, r1, r0 +; V7A-NEXT: lslpl r0, r1, r4 +; V7A-NEXT: lsl r1, r1, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: subs r2, r1, #1 +; V7A-NEXT: sbc r0, r0, #0 +; V7A-NEXT: and r1, r0, r3 +; V7A-NEXT: and r0, r2, r6 +; V7A-NEXT: pop {r4, r6, r11, pc} +; +; V7A-T-LABEL: bzhi64_a2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: movs r1, #1 +; V7A-T-NEXT: ldrd r12, lr, [r0] +; V7A-T-NEXT: subs.w r0, r2, #32 +; V7A-T-NEXT: lsr.w r3, r1, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r1, r0 +; V7A-T-NEXT: lsl.w r0, r1, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs r0, #1 +; V7A-T-NEXT: sbc r1, r3, #0 +; V7A-T-NEXT: and.w r0, r0, r12 +; V7A-T-NEXT: and.w r1, r1, lr +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r5, #0 +; V6M-NEXT: mov r1, r5 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r2, r0, #1 +; V6M-NEXT: sbcs r1, r5 +; V6M-NEXT: ldm r4!, {r0, r3} +; V6M-NEXT: ands r1, r3 +; V6M-NEXT: ands r0, r2 +; V6M-NEXT: pop {r4, r5, r7, pc} + %val = load i64, ptr %w + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r2, r1, #32 +; V7M-NEXT: movs r3, #1 +; V7M-NEXT: subs.w r12, r1, #32 +; V7M-NEXT: lsl.w r1, r3, r1 +; V7M-NEXT: lsr.w r2, r3, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r3, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: subs r3, r1, #1 +; V7M-NEXT: sbc r1, r2, #0 +; V7M-NEXT: ldrd r0, r2, [r0] +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: bx lr 
+; +; V7A-LABEL: bzhi64_a3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r6, r11, lr} +; V7A-NEXT: push {r4, r6, r11, lr} +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: mov r2, #1 +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: rsb r0, r1, #32 +; V7A-NEXT: subs r4, r1, #32 +; V7A-NEXT: lsl r1, r2, r1 +; V7A-NEXT: lsr r0, r2, r0 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: lslpl r0, r2, r4 +; V7A-NEXT: subs r2, r1, #1 +; V7A-NEXT: sbc r0, r0, #0 +; V7A-NEXT: and r1, r0, r3 +; V7A-NEXT: and r0, r2, r6 +; V7A-NEXT: pop {r4, r6, r11, pc} +; +; V7A-T-LABEL: bzhi64_a3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r1, #32 +; V7A-T-NEXT: movs r2, #1 +; V7A-T-NEXT: ldrd r12, lr, [r0] +; V7A-T-NEXT: subs.w r0, r1, #32 +; V7A-T-NEXT: lsr.w r3, r2, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r2, r0 +; V7A-T-NEXT: lsl.w r0, r2, r1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: subs r0, #1 +; V7A-T-NEXT: sbc r1, r3, #0 +; V7A-T-NEXT: and.w r0, r0, r12 +; V7A-T-NEXT: and.w r1, r1, lr +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r2, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r5, #0 +; V6M-NEXT: mov r1, r5 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: subs r2, r0, #1 +; V6M-NEXT: sbcs r1, r5 +; V6M-NEXT: ldm r4!, {r0, r3} +; V6M-NEXT: ands r1, r3 +; V6M-NEXT: ands r0, r2 +; V6M-NEXT: pop {r4, r5, r7, pc} + %val = load i64, ptr %w + %conv = zext i8 %numlowbits to i64 + %onebit = shl i64 1, %conv + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_a4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: mov.w r12, #1 +; V7M-NEXT: subs.w lr, r2, #32 +; V7M-NEXT: lsl.w r2, r12, r2 +; V7M-NEXT: lsr.w r3, r12, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r12, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: subs r2, #1 +; V7M-NEXT: sbc r3, r3, #0 +; V7M-NEXT: ands r0, r2 +; V7M-NEXT: ands r1, r3 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_a4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: mov r12, #1 +; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: subs r2, r2, #1 +; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: and r0, r0, r2 +; V7A-NEXT: and r1, r1, r3 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_a4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #32 +; V7A-T-NEXT: mov.w r12, #1 +; V7A-T-NEXT: subs.w lr, r2, #32 +; V7A-T-NEXT: lsl.w r2, r12, r2 +; V7A-T-NEXT: lsr.w r3, r12, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r3, r12, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: subs r2, #1 +; V7A-T-NEXT: sbc r3, r3, #0 +; V7A-T-NEXT: ands r0, r2 +; V7A-T-NEXT: ands r1, r3 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_a4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r6, lr} +; V6M-NEXT: push {r4, r5, r6, lr} +; V6M-NEXT: mov r5, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: movs r6, #0 +; V6M-NEXT: mov r1, r6 +; V6M-NEXT: bl __aeabi_llsl 
+; V6M-NEXT: subs r0, r0, #1 +; V6M-NEXT: sbcs r1, r6 +; V6M-NEXT: ands r1, r5 +; V6M-NEXT: ands r0, r4 +; V6M-NEXT: pop {r4, r5, r6, pc} + %onebit = shl i64 1, %numlowbits + %mask = add nsw i64 %onebit, -1 + %masked = and i64 %val, %mask ; swapped order + ret i64 %masked +} + +; ---------------------------------------------------------------------------- ; +; Pattern b. 32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_b0: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_b0: +; V7A: @ %bb.0: +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: bic r0, r0, r2, lsl r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_b0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_b0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: bx lr + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi32_b1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_b1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: bic r0, r0, r2, lsl r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_b1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_b1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: bx lr + %conv = zext i8 %numlowbits to i32 + %notmask = shl i32 -1, %conv + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_b2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_b2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: bic r0, r0, r2, lsl r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_b2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_b2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi32_b3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_b3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: bic r0, r0, r2, lsl r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_b3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; 
V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_b3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %conv = zext i8 %numlowbits to i32 + %notmask = shl i32 -1, %conv + %mask = xor i32 %notmask, -1 + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_b4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: lsl.w r1, r2, r1 +; V7M-NEXT: bics r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_b4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: bic r0, r0, r2, lsl r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_b4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: lsl.w r1, r2, r1 +; V7A-T-NEXT: bics r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_b4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: mvns r2, r2 +; V6M-NEXT: lsls r2, r1 +; V6M-NEXT: bics r0, r2 +; V6M-NEXT: bx lr + %notmask = shl i32 -1, %numlowbits + %mask = xor i32 %notmask, -1 + %masked = and i32 %val, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_b0: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r12, r3, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl.w r12, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl r3, r2 +; V7M-NEXT: bic.w r0, r0, r12 +; V7M-NEXT: bics r1, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_b0: +; V7A: @ %bb.0: +; V7A-NEXT: subs r12, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lslpl r3, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_b0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsl.w r12, r3, r2 +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w r12, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r3, r2 +; V7A-T-NEXT: bic.w r0, r0, r12 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_b0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r5, r0 +; V6M-NEXT: bics r4, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi64_b1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r12, r3, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl.w r12, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl r3, r2 +; V7M-NEXT: bic.w r0, r0, r12 +; V7M-NEXT: bics r1, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_b1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: subs r12, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lslpl r3, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_b1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; 
V7A-T-NEXT: lsl.w r12, r3, r2 +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w r12, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r3, r2 +; V7A-T-NEXT: bic.w r0, r0, r12 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_b1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r5, r0 +; V6M-NEXT: bics r4, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %conv = zext i8 %numlowbits to i64 + %notmask = shl i64 -1, %conv + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_b2_load: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r1, #-1 +; V7M-NEXT: subs.w r12, r2, #32 +; V7M-NEXT: lsl.w r3, r1, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: ldrd r0, r2, [r0] +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r1, r12 +; V7M-NEXT: bics r0, r3 +; V7M-NEXT: bic.w r1, r2, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_b2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, lr} +; V7A-NEXT: push {r4, lr} +; V7A-NEXT: ldr r4, [r0] +; V7A-NEXT: mvn r1, #0 +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: subs r0, r2, #32 +; V7A-NEXT: lsl r2, r1, r2 +; V7A-NEXT: lslpl r1, r1, r0 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r1, r3, r1 +; V7A-NEXT: bic r0, r4, r2 +; V7A-NEXT: pop {r4, pc} +; +; V7A-T-LABEL: bzhi64_b2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r1, #-1 +; V7A-T-NEXT: ldrd r0, r12, [r0] +; V7A-T-NEXT: lsl.w r3, r1, r2 +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r3, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r1, r2 +; V7A-T-NEXT: bics r0, r3 +; V7A-T-NEXT: bic.w r1, r12, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_b2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: ldm r4!, {r2, r3} +; V6M-NEXT: bics r2, r0 +; V6M-NEXT: bics r3, r1 +; V6M-NEXT: mov r0, r2 +; V6M-NEXT: mov r1, r3 +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { +; V7M-LABEL: bzhi64_b3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r2, #-1 +; V7M-NEXT: subs.w r12, r1, #32 +; V7M-NEXT: lsl.w r3, r2, r1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r3, #0 +; V7M-NEXT: ldrd r0, r1, [r0] +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r2, r12 +; V7M-NEXT: bics r1, r2 +; V7M-NEXT: bics r0, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_b3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r6, r11, lr} +; V7A-NEXT: push {r4, r6, r11, lr} +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: subs r0, r1, #32 +; V7A-NEXT: lsl r4, r2, r1 +; V7A-NEXT: lslpl r2, r2, r0 +; V7A-NEXT: movwpl r4, #0 +; V7A-NEXT: bic r1, r3, r2 +; V7A-NEXT: bic r0, r6, r4 +; V7A-NEXT: pop {r4, r6, r11, pc} +; +; V7A-T-LABEL: bzhi64_b3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: ldrd r0, r12, [r0] +; V7A-T-NEXT: lsl.w r3, r2, r1 +; V7A-T-NEXT: subs r1, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl 
r3, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r2, r1 +; V7A-T-NEXT: bics r0, r3 +; V7A-T-NEXT: bic.w r1, r12, r2 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_b3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: mov r2, r1 +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: ldm r4!, {r2, r3} +; V6M-NEXT: bics r2, r0 +; V6M-NEXT: bics r3, r1 +; V6M-NEXT: mov r0, r2 +; V6M-NEXT: mov r1, r3 +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %conv = zext i8 %numlowbits to i64 + %notmask = shl i64 -1, %conv + %mask = xor i64 %notmask, -1 + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_b4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsl.w r12, r3, r2 +; V7M-NEXT: subs r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: movpl.w r12, #0 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl r3, r2 +; V7M-NEXT: bic.w r0, r0, r12 +; V7M-NEXT: bics r1, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_b4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: subs r12, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsl r2, r3, r2 +; V7A-NEXT: lslpl r3, r3, r12 +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: bic r1, r1, r3 +; V7A-NEXT: bic r0, r0, r2 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_b4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsl.w r12, r3, r2 +; V7A-T-NEXT: subs r2, #32 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w r12, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl r3, r2 +; V7A-T-NEXT: bic.w r0, r0, r12 +; V7A-T-NEXT: bics r1, r3 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_b4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: bics r5, r0 +; V6M-NEXT: bics r4, r1 +; V6M-NEXT: mov r0, r5 +; V6M-NEXT: mov r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %notmask = shl i64 -1, %numlowbits + %mask = xor i64 %notmask, -1 + %masked = and i64 %val, %mask ; swapped order + ret i64 %masked +} + +; ---------------------------------------------------------------------------- ; +; Pattern c. 
32-bit +; ---------------------------------------------------------------------------- ; + +define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_c0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_c0: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_c0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_c0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_c1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_c1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_c1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_c1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %mask = lshr i32 -1, %sh_prom + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_c2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_c2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_c2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_c2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_c3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_c3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_c3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: 
lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_c3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %mask = lshr i32 -1, %sh_prom + %masked = and i32 %mask, %val + ret i32 %masked +} + +define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_c4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_c4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_c4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_c4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %numhighbits = sub i32 32, %numlowbits + %mask = lshr i32 -1, %numhighbits + %masked = and i32 %val, %mask ; swapped order + ret i32 %masked +} + +; 64-bit + +define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_c0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsbs.w lr, r2, #32 +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: mov.w r12, #-1 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsr.w r2, r12, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r3, r3, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_c0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsbs lr, r2, #32 +; V7A-NEXT: rsb r2, r2, #64 +; V7A-NEXT: mvn r12, #0 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r2, r12, r2 +; V7A-NEXT: lsrpl r3, r3, lr +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: and r1, r2, r1 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_c0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsbs.w lr, r2, #32 +; V7A-T-NEXT: rsb.w r2, r2, #64 +; V7A-T-NEXT: mov.w r12, #-1 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsr.w r2, r12, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r3, r3, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: ands r0, r3 +; V7A-T-NEXT: ands r1, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_c0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r2, r0, r2 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_c1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: uxtb r2, r2 +; V7M-NEXT: subs.w r12, r2, #32 +; V7M-NEXT: lsr.w r2, r3, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r3, r3, r12 +; V7M-NEXT: it pl +; 
V7M-NEXT: movpl r2, #0 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_c1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb lr, r2, #64 +; V7A-NEXT: mvn r2, #31 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: uxtb r12, lr +; V7A-NEXT: uxtab r2, r2, lr +; V7A-NEXT: lsr r12, r3, r12 +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: movwpl r12, #0 +; V7A-NEXT: lsrpl r3, r3, r2 +; V7A-NEXT: and r1, r12, r1 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_c1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w lr, r2, #64 +; V7A-T-NEXT: mvn r2, #31 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: uxtb.w r12, lr +; V7A-T-NEXT: uxtab r2, r2, lr +; V7A-T-NEXT: lsr.w r12, r3, r12 +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl.w r12, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r3, r2 +; V7A-T-NEXT: and.w r1, r1, r12 +; V7A-T-NEXT: ands r0, r3 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_c1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r0, r0, r2 +; V6M-NEXT: uxtb r2, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %mask = lshr i64 -1, %sh_prom + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_c2_load: +; V7M: @ %bb.0: +; V7M-NEXT: rsbs.w r1, r2, #32 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl r3, r1 +; V7M-NEXT: ldrd r0, r1, [r0] +; V7M-NEXT: mov.w r12, #-1 +; V7M-NEXT: lsr.w r2, r12, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_c2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r5, lr} +; V7A-NEXT: push {r5, lr} +; V7A-NEXT: rsbs r1, r2, #32 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: mvn r12, #0 +; V7A-NEXT: ldm r0, {r0, r5} +; V7A-NEXT: lsrpl r3, r3, r1 +; V7A-NEXT: rsb r1, r2, #64 +; V7A-NEXT: and r0, r3, r0 +; V7A-NEXT: lsr r1, r12, r1 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: and r1, r1, r5 +; V7A-NEXT: pop {r5, pc} +; +; V7A-T-LABEL: bzhi64_c2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsbs.w r1, r2, #32 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: ldrd r0, lr, [r0] +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r3, r1 +; V7A-T-NEXT: rsb.w r1, r2, #64 +; V7A-T-NEXT: mov.w r12, #-1 +; V7A-T-NEXT: and.w r0, r0, r3 +; V7A-T-NEXT: lsr.w r1, r12, r1 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: and.w r1, r1, lr +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_c2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r2, r0, r2 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldm r4!, {r2, r3} +; V6M-NEXT: ands r0, r2 +; V6M-NEXT: ands r1, r3 +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %mask, %val + ret 
i64 %masked +} + +define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_c3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #64 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: subs.w r2, r1, #32 +; V7M-NEXT: lsr.w r1, r3, r1 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl r3, r2 +; V7M-NEXT: ldrd r0, r2, [r0] +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_c3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r4, r6, r11, lr} +; V7A-NEXT: push {r4, r6, r11, lr} +; V7A-NEXT: rsb r1, r1, #64 +; V7A-NEXT: mvn r4, #31 +; V7A-NEXT: mvn r2, #0 +; V7A-NEXT: ldr r6, [r0] +; V7A-NEXT: ldr r3, [r0, #4] +; V7A-NEXT: uxtb r0, r1 +; V7A-NEXT: uxtab r4, r4, r1 +; V7A-NEXT: lsr r0, r2, r0 +; V7A-NEXT: cmp r4, #0 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: and r1, r0, r3 +; V7A-NEXT: lsrpl r2, r2, r4 +; V7A-NEXT: and r0, r2, r6 +; V7A-NEXT: pop {r4, r6, r11, pc} +; +; V7A-T-LABEL: bzhi64_c3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r1, r1, #64 +; V7A-T-NEXT: mvn r3, #31 +; V7A-T-NEXT: ldrd r12, lr, [r0] +; V7A-T-NEXT: mov.w r2, #-1 +; V7A-T-NEXT: uxtb r0, r1 +; V7A-T-NEXT: uxtab r3, r3, r1 +; V7A-T-NEXT: lsr.w r0, r2, r0 +; V7A-T-NEXT: cmp r3, #0 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: and.w r1, r0, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl r2, r3 +; V7A-T-NEXT: and.w r0, r2, r12 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_c3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: mov r4, r0 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r0, r0, r1 +; V6M-NEXT: uxtb r2, r0 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ldm r4!, {r2, r3} +; V6M-NEXT: ands r0, r2 +; V6M-NEXT: ands r1, r3 +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %mask = lshr i64 -1, %sh_prom + %masked = and i64 %mask, %val + ret i64 %masked +} + +define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_c4_commutative: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsbs.w lr, r2, #32 +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: mov.w r12, #-1 +; V7M-NEXT: mov.w r3, #-1 +; V7M-NEXT: lsr.w r2, r12, r2 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r3, r3, lr +; V7M-NEXT: it pl +; V7M-NEXT: movpl r2, #0 +; V7M-NEXT: ands r0, r3 +; V7M-NEXT: ands r1, r2 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_c4_commutative: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsbs lr, r2, #32 +; V7A-NEXT: rsb r2, r2, #64 +; V7A-NEXT: mvn r12, #0 +; V7A-NEXT: mvn r3, #0 +; V7A-NEXT: lsr r2, r12, r2 +; V7A-NEXT: lsrpl r3, r3, lr +; V7A-NEXT: movwpl r2, #0 +; V7A-NEXT: and r0, r0, r3 +; V7A-NEXT: and r1, r1, r2 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_c4_commutative: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsbs.w lr, r2, #32 +; V7A-T-NEXT: rsb.w r2, r2, #64 +; V7A-T-NEXT: mov.w r12, #-1 +; V7A-T-NEXT: mov.w r3, #-1 +; V7A-T-NEXT: lsr.w r2, r12, r2 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r3, r3, lr +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r2, #0 +; V7A-T-NEXT: ands r0, r3 +; V7A-T-NEXT: ands r1, r2 +; V7A-T-NEXT: pop {r7, pc} +; +; 
V6M-LABEL: bzhi64_c4_commutative: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, r5, r7, lr} +; V6M-NEXT: push {r4, r5, r7, lr} +; V6M-NEXT: mov r4, r1 +; V6M-NEXT: mov r5, r0 +; V6M-NEXT: movs r0, #64 +; V6M-NEXT: subs r2, r0, r2 +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: mvns r0, r0 +; V6M-NEXT: mov r1, r0 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: ands r0, r5 +; V6M-NEXT: ands r1, r4 +; V6M-NEXT: pop {r4, r5, r7, pc} + %numhighbits = sub i64 64, %numlowbits + %mask = lshr i64 -1, %numhighbits + %masked = and i64 %val, %mask ; swapped order + ret i64 %masked +} + +; ---------------------------------------------------------------------------- ; +; Pattern d. 32-bit. +; ---------------------------------------------------------------------------- ; + +define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_d0: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_d0: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_d0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_d0: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %numhighbits = sub i32 32, %numlowbits + %highbitscleared = shl i32 %val, %numhighbits + %masked = lshr i32 %highbitscleared, %numhighbits + ret i32 %masked +} + +define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_d1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_d1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_d1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_d1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %highbitscleared = shl i32 %val, %sh_prom + %masked = lshr i32 %highbitscleared, %sh_prom + ret i32 %masked +} + +define i32 @bzhi32_d2_load(ptr %w, i32 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_d2_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_d2_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_d2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_d2_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %numhighbits = sub i32 32, %numlowbits + %highbitscleared = shl i32 %val, %numhighbits + %masked = lshr 
i32 %highbitscleared, %numhighbits + ret i32 %masked +} + +define i32 @bzhi32_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi32_d3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #32 +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: lsls r0, r1 +; V7M-NEXT: lsrs r0, r1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_d3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: rsb r1, r1, #32 +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: uxtb r1, r1 +; V7A-NEXT: lsl r0, r0, r1 +; V7A-NEXT: lsr r0, r0, r1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_d3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: rsb.w r1, r1, #32 +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: uxtb r1, r1 +; V7A-T-NEXT: lsls r0, r1 +; V7A-T-NEXT: lsrs r0, r1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_d3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #32 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: uxtb r1, r1 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: lsls r0, r1 +; V6M-NEXT: lsrs r0, r1 +; V6M-NEXT: bx lr + %val = load i32, ptr %w + %numhighbits = sub i8 32, %numlowbits + %sh_prom = zext i8 %numhighbits to i32 + %highbitscleared = shl i32 %val, %sh_prom + %masked = lshr i32 %highbitscleared, %sh_prom + ret i32 %masked +} + +; 64-bit. + +define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_d0: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r3, r2, #64 +; V7M-NEXT: rsbs.w r2, r2, #32 +; V7M-NEXT: rsb.w lr, r3, #32 +; V7M-NEXT: lsl.w r12, r1, r3 +; V7M-NEXT: lsr.w r1, r0, lr +; V7M-NEXT: orr.w r1, r1, r12 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, r2 +; V7M-NEXT: lsl.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r12, r1, lr +; V7M-NEXT: lsr.w r0, r0, r3 +; V7M-NEXT: orr.w r0, r0, r12 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r2 +; V7M-NEXT: lsr.w r1, r1, r3 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_d0: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb lr, r2, #64 +; V7A-NEXT: rsbs r2, r2, #32 +; V7A-NEXT: rsb r12, lr, #32 +; V7A-NEXT: lsr r3, r0, r12 +; V7A-NEXT: orr r1, r3, r1, lsl lr +; V7A-NEXT: lslpl r1, r0, r2 +; V7A-NEXT: lsl r0, r0, lr +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, lr +; V7A-NEXT: orr r0, r0, r1, lsl r12 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: lsr r1, r1, lr +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_d0: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #64 +; V7A-T-NEXT: rsbs.w r2, r2, #32 +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: lsl.w r12, r1, r3 +; V7A-T-NEXT: lsr.w r1, r0, lr +; V7A-T-NEXT: orr.w r1, r1, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r1, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r12, r1, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: lsr.w r1, r1, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_d0: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: movs r3, #64 +; V6M-NEXT: subs r4, r3, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %numhighbits = sub i64 64, %numlowbits + %highbitscleared = shl i64 %val, 
%numhighbits + %masked = lshr i64 %highbitscleared, %numhighbits + ret i64 %masked +} + +define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_d1_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r2, r2, #64 +; V7M-NEXT: uxtb r2, r2 +; V7M-NEXT: rsb.w r3, r2, #32 +; V7M-NEXT: lsl.w r12, r1, r2 +; V7M-NEXT: lsr.w r1, r0, r3 +; V7M-NEXT: orr.w r1, r1, r12 +; V7M-NEXT: subs.w r12, r2, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r1, r0, r12 +; V7M-NEXT: lsl.w r0, r0, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r3, r1, r3 +; V7M-NEXT: lsr.w r0, r0, r2 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r1, r12 +; V7M-NEXT: lsr.w r1, r1, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_d1_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r11, lr} +; V7A-NEXT: push {r11, lr} +; V7A-NEXT: rsb lr, r2, #64 +; V7A-NEXT: uxtb r3, lr +; V7A-NEXT: rsb r12, r3, #32 +; V7A-NEXT: lsr r2, r0, r12 +; V7A-NEXT: orr r1, r2, r1, lsl r3 +; V7A-NEXT: mvn r2, #31 +; V7A-NEXT: uxtab r2, r2, lr +; V7A-NEXT: cmp r2, #0 +; V7A-NEXT: lslpl r1, r0, r2 +; V7A-NEXT: lsl r0, r0, r3 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r3 +; V7A-NEXT: orr r0, r0, r1, lsl r12 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: lsr r1, r1, r3 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r11, pc} +; +; V7A-T-LABEL: bzhi64_d1_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r4, r2, #64 +; V7A-T-NEXT: mvn r2, #31 +; V7A-T-NEXT: uxtb r3, r4 +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: lsl.w r12, r1, r3 +; V7A-T-NEXT: uxtab r2, r2, r4 +; V7A-T-NEXT: lsr.w r1, r0, lr +; V7A-T-NEXT: cmp r2, #0 +; V7A-T-NEXT: orr.w r1, r1, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r1, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r4, r1, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: lsr.w r1, r1, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bzhi64_d1_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: movs r3, #64 +; V6M-NEXT: subs r2, r3, r2 +; V6M-NEXT: uxtb r4, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %highbitscleared = shl i64 %val, %sh_prom + %masked = lshr i64 %highbitscleared, %sh_prom + ret i64 %masked +} + +define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_d2_load: +; V7M: @ %bb.0: +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: rsb.w r1, r2, #64 +; V7M-NEXT: ldrd r0, r3, [r0] +; V7M-NEXT: rsb.w lr, r1, #32 +; V7M-NEXT: rsbs.w r2, r2, #32 +; V7M-NEXT: lsl.w r12, r3, r1 +; V7M-NEXT: lsr.w r3, r0, lr +; V7M-NEXT: orr.w r3, r3, r12 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r3, r0, r2 +; V7M-NEXT: lsl.w r0, r0, r1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r12, r3, lr +; V7M-NEXT: lsr.w r0, r0, r1 +; V7M-NEXT: lsr.w r1, r3, r1 +; V7M-NEXT: orr.w r0, r0, r12 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r3, r2 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: pop {r7, pc} +; +; V7A-LABEL: bzhi64_d2_load: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r5, r7, r11, lr} +; V7A-NEXT: push 
{r5, r7, r11, lr} +; V7A-NEXT: rsb r3, r2, #64 +; V7A-NEXT: ldm r0, {r0, r7} +; V7A-NEXT: rsb r1, r3, #32 +; V7A-NEXT: rsbs r2, r2, #32 +; V7A-NEXT: lsr r5, r0, r1 +; V7A-NEXT: orr r7, r5, r7, lsl r3 +; V7A-NEXT: lslpl r7, r0, r2 +; V7A-NEXT: lsl r0, r0, r3 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r3 +; V7A-NEXT: orr r0, r0, r7, lsl r1 +; V7A-NEXT: lsr r1, r7, r3 +; V7A-NEXT: lsrpl r0, r7, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r5, r7, r11, pc} +; +; V7A-T-LABEL: bzhi64_d2_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r7, lr} +; V7A-T-NEXT: push {r7, lr} +; V7A-T-NEXT: rsb.w r3, r2, #64 +; V7A-T-NEXT: ldrd r0, r1, [r0] +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: rsbs.w r2, r2, #32 +; V7A-T-NEXT: lsl.w r12, r1, r3 +; V7A-T-NEXT: lsr.w r1, r0, lr +; V7A-T-NEXT: orr.w r1, r1, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r1, r0, r2 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r12, r1, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r1, r2 +; V7A-T-NEXT: lsr.w r1, r1, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r7, pc} +; +; V6M-LABEL: bzhi64_d2_load: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: movs r1, #64 +; V6M-NEXT: subs r4, r1, r2 +; V6M-NEXT: ldr r2, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %numhighbits = sub i64 64, %numlowbits + %highbitscleared = shl i64 %val, %numhighbits + %masked = lshr i64 %highbitscleared, %numhighbits + ret i64 %masked +} + +define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { +; V7M-LABEL: bzhi64_d3_load_indexzext: +; V7M: @ %bb.0: +; V7M-NEXT: rsb.w r1, r1, #64 +; V7M-NEXT: ldrd r0, r2, [r0] +; V7M-NEXT: uxtb r1, r1 +; V7M-NEXT: rsb.w r3, r1, #32 +; V7M-NEXT: lsl.w r12, r2, r1 +; V7M-NEXT: lsr.w r2, r0, r3 +; V7M-NEXT: orr.w r2, r2, r12 +; V7M-NEXT: subs.w r12, r1, #32 +; V7M-NEXT: it pl +; V7M-NEXT: lslpl.w r2, r0, r12 +; V7M-NEXT: lsl.w r0, r0, r1 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r3, r2, r3 +; V7M-NEXT: lsr.w r0, r0, r1 +; V7M-NEXT: lsr.w r1, r2, r1 +; V7M-NEXT: orr.w r0, r0, r3 +; V7M-NEXT: it pl +; V7M-NEXT: lsrpl.w r0, r2, r12 +; V7M-NEXT: it pl +; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_d3_load_indexzext: +; V7A: @ %bb.0: +; V7A-NEXT: .save {r5, r7, r11, lr} +; V7A-NEXT: push {r5, r7, r11, lr} +; V7A-NEXT: rsb r1, r1, #64 +; V7A-NEXT: ldm r0, {r0, r7} +; V7A-NEXT: uxtb r2, r1 +; V7A-NEXT: rsb r3, r2, #32 +; V7A-NEXT: lsr r5, r0, r3 +; V7A-NEXT: orr r7, r5, r7, lsl r2 +; V7A-NEXT: mvn r5, #31 +; V7A-NEXT: uxtab r1, r5, r1 +; V7A-NEXT: cmp r1, #0 +; V7A-NEXT: lslpl r7, r0, r1 +; V7A-NEXT: lsl r0, r0, r2 +; V7A-NEXT: movwpl r0, #0 +; V7A-NEXT: lsr r0, r0, r2 +; V7A-NEXT: orr r0, r0, r7, lsl r3 +; V7A-NEXT: lsrpl r0, r7, r1 +; V7A-NEXT: lsr r1, r7, r2 +; V7A-NEXT: movwpl r1, #0 +; V7A-NEXT: pop {r5, r7, r11, pc} +; +; V7A-T-LABEL: bzhi64_d3_load_indexzext: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: .save {r4, lr} +; V7A-T-NEXT: push {r4, lr} +; V7A-T-NEXT: rsb.w r4, r1, #64 +; V7A-T-NEXT: ldrd r0, r2, [r0] +; V7A-T-NEXT: mvn r1, #31 +; V7A-T-NEXT: uxtb r3, r4 +; V7A-T-NEXT: rsb.w lr, r3, #32 +; V7A-T-NEXT: lsl.w r12, r2, r3 +; V7A-T-NEXT: uxtab r1, r1, r4 +; V7A-T-NEXT: lsr.w r2, r0, lr +; V7A-T-NEXT: 
cmp r1, #0 +; V7A-T-NEXT: orr.w r2, r2, r12 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lslpl.w r2, r0, r1 +; V7A-T-NEXT: lsl.w r0, r0, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r0, #0 +; V7A-T-NEXT: lsl.w r4, r2, lr +; V7A-T-NEXT: lsr.w r0, r0, r3 +; V7A-T-NEXT: orr.w r0, r0, r4 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: lsrpl.w r0, r2, r1 +; V7A-T-NEXT: lsr.w r1, r2, r3 +; V7A-T-NEXT: it pl +; V7A-T-NEXT: movpl r1, #0 +; V7A-T-NEXT: pop {r4, pc} +; +; V6M-LABEL: bzhi64_d3_load_indexzext: +; V6M: @ %bb.0: +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: movs r2, #64 +; V6M-NEXT: subs r1, r2, r1 +; V6M-NEXT: uxtb r4, r1 +; V6M-NEXT: ldr r2, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: mov r0, r2 +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsl +; V6M-NEXT: mov r2, r4 +; V6M-NEXT: bl __aeabi_llsr +; V6M-NEXT: pop {r4, pc} + %val = load i64, ptr %w + %numhighbits = sub i8 64, %numlowbits + %sh_prom = zext i8 %numhighbits to i64 + %highbitscleared = shl i64 %val, %sh_prom + %masked = lshr i64 %highbitscleared, %sh_prom + ret i64 %masked +} + +; ---------------------------------------------------------------------------- ; +; Constant mask +; ---------------------------------------------------------------------------- ; + +; 32-bit + +define i32 @bzhi32_constant_mask32(i32 %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask32: +; V7M: @ %bb.0: +; V7M-NEXT: bic r0, r0, #-2147483648 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_constant_mask32: +; V7A: @ %bb.0: +; V7A-NEXT: bic r0, r0, #-2147483648 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: bic r0, r0, #-2147483648 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask32: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r1, #31 +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: bx lr + %masked = and i32 %val, 2147483647 + ret i32 %masked +} + +define i32 @bzhi32_constant_mask32_load(ptr %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask32_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: bic r0, r0, #-2147483648 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_constant_mask32_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: bic r0, r0, #-2147483648 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask32_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: bic r0, r0, #-2147483648 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask32_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r1, #31 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: bx lr + %val1 = load i32, ptr %val + %masked = and i32 %val1, 2147483647 + ret i32 %masked +} + +define i32 @bzhi32_constant_mask16(i32 %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask16: +; V7M: @ %bb.0: +; V7M-NEXT: bfc r0, #15, #17 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_constant_mask16: +; V7A: @ %bb.0: +; V7A-NEXT: bfc r0, #15, #17 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask16: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: bfc r0, #15, #17 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask16: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, .LCPI41_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI41_0: +; V6M-NEXT: .long 32767 @ 0x7fff + %masked = and i32 %val, 32767 + ret i32 %masked +} + +define i32 @bzhi32_constant_mask16_load(ptr %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask16_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: bfc r0, #15, #17 +; V7M-NEXT: bx lr +; +; 
V7A-LABEL: bzhi32_constant_mask16_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: bfc r0, #15, #17 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask16_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: bfc r0, #15, #17 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask16_load: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, [r0] +; V6M-NEXT: ldr r0, .LCPI42_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI42_0: +; V6M-NEXT: .long 32767 @ 0x7fff + %val1 = load i32, ptr %val + %masked = and i32 %val1, 32767 + ret i32 %masked +} + +define i32 @bzhi32_constant_mask8(i32 %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask8: +; V7M: @ %bb.0: +; V7M-NEXT: and r0, r0, #127 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_constant_mask8: +; V7A: @ %bb.0: +; V7A-NEXT: and r0, r0, #127 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask8: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: and r0, r0, #127 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask8: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #127 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %masked = and i32 %val, 127 + ret i32 %masked +} + +define i32 @bzhi32_constant_mask8_load(ptr %val) nounwind { +; V7M-LABEL: bzhi32_constant_mask8_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: and r0, r0, #127 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi32_constant_mask8_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: and r0, r0, #127 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi32_constant_mask8_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: and r0, r0, #127 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi32_constant_mask8_load: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, [r0] +; V6M-NEXT: movs r0, #127 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: bx lr + %val1 = load i32, ptr %val + %masked = and i32 %val1, 127 + ret i32 %masked +} + +; 64-bit + +define i64 @bzhi64_constant_mask64(i64 %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask64: +; V7M: @ %bb.0: +; V7M-NEXT: bic r1, r1, #-1073741824 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask64: +; V7A: @ %bb.0: +; V7A-NEXT: bic r1, r1, #-1073741824 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask64: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: bic r1, r1, #-1073741824 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask64: +; V6M: @ %bb.0: +; V6M-NEXT: movs r2, #3 +; V6M-NEXT: lsls r2, r2, #30 +; V6M-NEXT: bics r1, r2 +; V6M-NEXT: bx lr + %masked = and i64 %val, 4611686018427387903 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask64_load(ptr %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask64_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldrd r0, r1, [r0] +; V7M-NEXT: bic r1, r1, #-1073741824 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask64_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldrd r0, r1, [r0] +; V7A-NEXT: bic r1, r1, #-1073741824 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask64_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldrd r0, r1, [r0] +; V7A-T-NEXT: bic r1, r1, #-1073741824 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask64_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #3 +; V6M-NEXT: lsls r3, r1, #30 +; V6M-NEXT: ldr r2, [r0] +; V6M-NEXT: ldr r1, [r0, #4] +; V6M-NEXT: bics r1, r3 +; V6M-NEXT: mov r0, r2 +; V6M-NEXT: bx lr + %val1 = load i64, ptr %val + %masked = and i64 %val1, 4611686018427387903 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask32(i64 %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask32: +; V7M: @ %bb.0: +; V7M-NEXT: bic r0, 
r0, #-2147483648 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask32: +; V7A: @ %bb.0: +; V7A-NEXT: bic r0, r0, #-2147483648 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask32: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: bic r0, r0, #-2147483648 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask32: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r1, #31 +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %masked = and i64 %val, 2147483647 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask32_load(ptr %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask32_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bic r0, r0, #-2147483648 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask32_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bic r0, r0, #-2147483648 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask32_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bic r0, r0, #-2147483648 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask32_load: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #1 +; V6M-NEXT: lsls r1, r1, #31 +; V6M-NEXT: ldr r0, [r0] +; V6M-NEXT: bics r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %val1 = load i64, ptr %val + %masked = and i64 %val1, 2147483647 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask16(i64 %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask16: +; V7M: @ %bb.0: +; V7M-NEXT: bfc r0, #15, #17 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask16: +; V7A: @ %bb.0: +; V7A-NEXT: bfc r0, #15, #17 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask16: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: bfc r0, #15, #17 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask16: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, .LCPI49_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI49_0: +; V6M-NEXT: .long 32767 @ 0x7fff + %masked = and i64 %val, 32767 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask16_load(ptr %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask16_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bfc r0, #15, #17 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask16_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bfc r0, #15, #17 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask16_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bfc r0, #15, #17 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask16_load: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, [r0] +; V6M-NEXT: ldr r0, .LCPI50_0 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.1: +; V6M-NEXT: .LCPI50_0: +; V6M-NEXT: .long 32767 @ 0x7fff + %val1 = load i64, ptr %val + %masked = and i64 %val1, 32767 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask8(i64 %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask8: +; V7M: @ %bb.0: +; V7M-NEXT: and r0, r0, #127 +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask8: +; V7A: @ %bb.0: +; V7A-NEXT: and r0, r0, #127 +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask8: +; 
V7A-T: @ %bb.0: +; V7A-T-NEXT: and r0, r0, #127 +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask8: +; V6M: @ %bb.0: +; V6M-NEXT: movs r1, #127 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %masked = and i64 %val, 127 + ret i64 %masked +} + +define i64 @bzhi64_constant_mask8_load(ptr %val) nounwind { +; V7M-LABEL: bzhi64_constant_mask8_load: +; V7M: @ %bb.0: +; V7M-NEXT: ldr r0, [r0] +; V7M-NEXT: movs r1, #0 +; V7M-NEXT: and r0, r0, #127 +; V7M-NEXT: bx lr +; +; V7A-LABEL: bzhi64_constant_mask8_load: +; V7A: @ %bb.0: +; V7A-NEXT: ldr r0, [r0] +; V7A-NEXT: mov r1, #0 +; V7A-NEXT: and r0, r0, #127 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: bzhi64_constant_mask8_load: +; V7A-T: @ %bb.0: +; V7A-T-NEXT: ldr r0, [r0] +; V7A-T-NEXT: movs r1, #0 +; V7A-T-NEXT: and r0, r0, #127 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: bzhi64_constant_mask8_load: +; V6M: @ %bb.0: +; V6M-NEXT: ldr r1, [r0] +; V6M-NEXT: movs r0, #127 +; V6M-NEXT: ands r0, r1 +; V6M-NEXT: movs r1, #0 +; V6M-NEXT: bx lr + %val1 = load i64, ptr %val + %masked = and i64 %val1, 127 + ret i64 %masked +} diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll index 960bbf5..df04b67 100644 --- a/llvm/test/CodeGen/X86/isel-fpclass.ll +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86-SDAGISEL +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL ; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL ; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL +; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL -; FIXME: We can reuse/delete llvm/test/CodeGen/X86/is_fpclass.ll when all patches are included. 
- -define i1 @isnone_f(float %x) { -; X86-SDAGISEL-LABEL: isnone_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: xorl %eax, %eax -; X86-SDAGISEL-NEXT: retl +define i1 @isnone_f(float %x) nounwind { +; X86-LABEL: isnone_f: +; X86: # %bb.0: # %entry +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl ; ; X64-LABEL: isnone_f: ; X64: # %bb.0: # %entry @@ -28,11 +28,11 @@ entry: ret i1 %0 } -define i1 @isany_f(float %x) { -; X86-SDAGISEL-LABEL: isany_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movb $1, %al -; X86-SDAGISEL-NEXT: retl +define i1 @isany_f(float %x) nounwind { +; X86-LABEL: isany_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movb $1, %al +; X86-NEXT: retl ; ; X64-LABEL: isany_f: ; X64: # %bb.0: # %entry @@ -50,17 +50,17 @@ entry: ret i1 %0 } -define i1 @issignaling_f(float %x) { -; X86-SDAGISEL-LABEL: issignaling_f: -; X86-SDAGISEL: # %bb.0: -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %cl -; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: andb %cl, %al -; X86-SDAGISEL-NEXT: retl +define i1 @issignaling_f(float %x) nounwind { +; X86-LABEL: issignaling_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl ; ; X64-LABEL: issignaling_f: ; X64: # %bb.0: @@ -76,7 +76,6 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-LABEL: issignaling_f: ; X86-FASTISEL: # %bb.0: ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -87,20 +86,19 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: andb %cl, %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %a0 } - define i1 @isquiet_f(float %x) { -; X86-SDAGISEL-LABEL: isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl + define i1 @isquiet_f(float %x) nounwind { +; X86-LABEL: isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: isquiet_f: ; X64: # %bb.0: # %entry @@ -113,7 +111,6 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-LABEL: isquiet_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -121,21 +118,20 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: 
.cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 } -define i1 @not_isquiet_f(float %x) { -; X86-SDAGISEL-LABEL: not_isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +define i1 @not_isquiet_f(float %x) nounwind { +; X86-LABEL: not_isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: not_isquiet_f: ; X64: # %bb.0: # %entry @@ -148,7 +144,6 @@ define i1 @not_isquiet_f(float %x) { ; X86-FASTISEL-LABEL: not_isquiet_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -156,21 +151,20 @@ define i1 @not_isquiet_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 } -define i1 @isinf_f(float %x) { -; X86-SDAGISEL-LABEL: isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +define i1 @isinf_f(float %x) nounwind { +; X86-LABEL: isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: isinf_f: ; X64: # %bb.0: # %entry @@ -183,7 +177,6 @@ define i1 @isinf_f(float %x) { ; X86-FASTISEL-LABEL: isinf_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -191,21 +184,20 @@ define i1 @isinf_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 } -define i1 @not_isinf_f(float %x) { -; X86-SDAGISEL-LABEL: not_isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +define i1 @not_isinf_f(float %x) nounwind { +; X86-LABEL: not_isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_isinf_f: ; X64: # %bb.0: # %entry @@ -218,7 +210,6 
@@ define i1 @not_isinf_f(float %x) { ; X86-FASTISEL-LABEL: not_isinf_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -226,19 +217,18 @@ define i1 @not_isinf_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 } -define i1 @is_plus_inf_f(float %x) { -; X86-SDAGISEL-LABEL: is_plus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +define i1 @is_plus_inf_f(float %x) nounwind { +; X86-LABEL: is_plus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_inf_f: ; X64: # %bb.0: # %entry @@ -250,25 +240,23 @@ define i1 @is_plus_inf_f(float %x) { ; X86-FASTISEL-LABEL: is_plus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 } -define i1 @is_minus_inf_f(float %x) { -; X86-SDAGISEL-LABEL: is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +define i1 @is_minus_inf_f(float %x) nounwind { +; X86-LABEL: is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -280,25 +268,23 @@ define i1 @is_minus_inf_f(float %x) { ; X86-FASTISEL-LABEL: is_minus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 } -define i1 @not_is_minus_inf_f(float %x) { -; X86-SDAGISEL-LABEL: not_is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +define i1 @not_is_minus_inf_f(float %x) nounwind { +; X86-LABEL: not_is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -310,27 +296,25 @@ define i1 @not_is_minus_inf_f(float %x) { ; X86-FASTISEL-LABEL: not_is_minus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: 
.cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 } -define i1 @isfinite_f(float %x) { -; X86-SDAGISEL-LABEL: isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +define i1 @isfinite_f(float %x) nounwind { +; X86-LABEL: isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: isfinite_f: ; X64: # %bb.0: # %entry @@ -343,7 +327,6 @@ define i1 @isfinite_f(float %x) { ; X86-FASTISEL-LABEL: isfinite_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -351,21 +334,20 @@ define i1 @isfinite_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } -define i1 @not_isfinite_f(float %x) { -; X86-SDAGISEL-LABEL: not_isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl +define i1 @not_isfinite_f(float %x) nounwind { +; X86-LABEL: not_isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: not_isfinite_f: ; X64: # %bb.0: # %entry @@ -378,7 +360,6 @@ define i1 @not_isfinite_f(float %x) { ; X86-FASTISEL-LABEL: not_isfinite_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -386,19 +367,18 @@ define i1 @not_isfinite_f(float %x) { ; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 } -define i1 @is_plus_finite_f(float %x) { -; X86-SDAGISEL-LABEL: is_plus_finite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setb %al -; X86-SDAGISEL-NEXT: retl +define i1 @is_plus_finite_f(float %x) nounwind { +; X86-LABEL: is_plus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) 
# imm = 0x7F800000 +; X86-NEXT: setb %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_finite_f: ; X64: # %bb.0: # %entry @@ -410,13 +390,11 @@ define i1 @is_plus_finite_f(float %x) { ; X86-FASTISEL-LABEL: is_plus_finite_f: ; X86-FASTISEL: # %bb.0: # %entry ; X86-FASTISEL-NEXT: pushl %eax -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 ; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) ; X86-FASTISEL-NEXT: fstps (%esp) ; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 ; X86-FASTISEL-NEXT: setb %al ; X86-FASTISEL-NEXT: popl %ecx -; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" @@ -424,10 +402,10 @@ entry: } define i1 @isnone_d(double %x) nounwind { -; X86-SDAGISEL-LABEL: isnone_d: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: xorl %eax, %eax -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isnone_d: +; X86: # %bb.0: # %entry +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl ; ; X64-LABEL: isnone_d: ; X64: # %bb.0: # %entry @@ -446,10 +424,10 @@ entry: } define i1 @isany_d(double %x) nounwind { -; X86-SDAGISEL-LABEL: isany_d: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movb $1, %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isany_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movb $1, %al +; X86-NEXT: retl ; ; X64-LABEL: isany_d: ; X64: # %bb.0: # %entry @@ -468,10 +446,10 @@ entry: } define i1 @isnone_f80(x86_fp80 %x) nounwind { -; X86-SDAGISEL-LABEL: isnone_f80: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: xorl %eax, %eax -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isnone_f80: +; X86: # %bb.0: # %entry +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl ; ; X64-SDAGISEL-LABEL: isnone_f80: ; X64-SDAGISEL: # %bb.0: # %entry @@ -491,16 +469,21 @@ define i1 @isnone_f80(x86_fp80 %x) nounwind { ; X64-FASTISEL-NEXT: fstp %st(0) ; X64-FASTISEL-NEXT: xorl %eax, %eax ; X64-FASTISEL-NEXT: retq +; +; X64-GISEL-LABEL: isnone_f80: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %eax, %eax +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0) ret i1 %0 } define i1 @isany_f80(x86_fp80 %x) nounwind { -; X86-SDAGISEL-LABEL: isany_f80: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movb $1, %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isany_f80: +; X86: # %bb.0: # %entry +; X86-NEXT: movb $1, %al +; X86-NEXT: retl ; ; X64-SDAGISEL-LABEL: isany_f80: ; X64-SDAGISEL: # %bb.0: # %entry @@ -520,6 +503,11 @@ define i1 @isany_f80(x86_fp80 %x) nounwind { ; X64-FASTISEL-NEXT: fstp %st(0) ; X64-FASTISEL-NEXT: movb $1, %al ; X64-FASTISEL-NEXT: retq +; +; X64-GISEL-LABEL: isany_f80: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movb $1, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023) ret i1 %0 diff --git a/llvm/test/CodeGen/X86/isel-smax.ll b/llvm/test/CodeGen/X86/isel-smax.ll index 9c9a48e..1ce0a80 100644 --- a/llvm/test/CodeGen/X86/isel-smax.ll +++ b/llvm/test/CodeGen/X86/isel-smax.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s 
-mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone { -; X64-LABEL: smax_i8: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: cmovgl %edi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; DAG-X64-LABEL: smax_i8: +; DAG-X64: # %bb.0: +; DAG-X64-NEXT: movl %esi, %eax +; DAG-X64-NEXT: cmpb %al, %dil +; DAG-X64-NEXT: cmovgl %edi, %eax +; DAG-X64-NEXT: # kill: def $al killed $al killed $eax +; DAG-X64-NEXT: retq ; ; FASTISEL-X64-LABEL: smax_i8: ; FASTISEL-X64: # %bb.0: @@ -24,6 +24,17 @@ define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone { ; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax ; FASTISEL-X64-NEXT: retq ; +; GISEL-X64-LABEL: smax_i8: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %esi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpb %al, %dil +; GISEL-X64-NEXT: setg %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovnew %di, %ax +; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X64-NEXT: retq +; ; X86-LABEL: smax_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx @@ -35,16 +46,20 @@ define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone { ; X86-NEXT: .LBB0_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: smax_i8: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpb %cl, %al -; FASTISEL-X86-NEXT: jg .LBB0_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB0_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: smax_i8: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpb %al, %cl +; GISEL-X86-NEXT: setg %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB0_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB0_2: +; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X86-NEXT: retl %ret = call i8 @llvm.smax.i8(i8 %a, i8 %b) ret i8 %ret } @@ -57,25 +72,28 @@ define i16 @smax_i16(i16 %a, i16 %b) nounwind readnone { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smax_i16: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpw %ax, %di -; FASTISEL-X64-NEXT: cmovgl %edi, %eax -; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: smax_i16: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpw %si, %ax +; GISEL-X64-NEXT: setg %cl +; GISEL-X64-NEXT: andl $1, %ecx +; 
GISEL-X64-NEXT: cmovew %si, %ax +; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: smax_i16: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: jg .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; DAG-X86-LABEL: smax_i16: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: cmpw %cx, %ax +; DAG-X86-NEXT: jg .LBB1_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: .LBB1_2: +; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: smax_i16: ; FASTISEL-X86: # %bb.0: @@ -88,6 +106,21 @@ define i16 @smax_i16(i16 %a, i16 %b) nounwind readnone { ; FASTISEL-X86-NEXT: .LBB1_2: ; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: smax_i16: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpw %ax, %cx +; GISEL-X86-NEXT: setg %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB1_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB1_2: +; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X86-NEXT: retl %ret = call i16 @llvm.smax.i16(i16 %a, i16 %b) ret i16 %ret } @@ -99,12 +132,15 @@ define i32 @smax_i32(i32 %a, i32 %b) nounwind readnone { ; X64-NEXT: cmovgl %edi, %eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smax_i32: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpl %esi, %edi -; FASTISEL-X64-NEXT: cmovgl %edi, %eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: smax_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setg %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovel %esi, %eax +; GISEL-X64-NEXT: retq ; ; X86-LABEL: smax_i32: ; X86: # %bb.0: @@ -117,16 +153,19 @@ define i32 @smax_i32(i32 %a, i32 %b) nounwind readnone { ; X86-NEXT: .LBB2_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: smax_i32: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpl %ecx, %eax -; FASTISEL-X86-NEXT: jg .LBB2_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB2_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: smax_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpl %eax, %ecx +; GISEL-X86-NEXT: setg %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB2_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB2_2: +; GISEL-X86-NEXT: retl %ret = call i32 @llvm.smax.i32(i32 %a, i32 %b) ret i32 %ret } @@ -138,32 +177,35 @@ define i64 @smax_i64(i64 %a, i64 %b) nounwind readnone { ; X64-NEXT: cmovgq %rdi, %rax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smax_i64: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movq %rsi, %rax -; FASTISEL-X64-NEXT: cmpq %rsi, %rdi -; FASTISEL-X64-NEXT: cmovgq %rdi, %rax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: 
smax_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpq %rsi, %rdi +; GISEL-X64-NEXT: setg %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmoveq %rsi, %rax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: smax_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: jl .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: .LBB3_2: -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; DAG-X86-LABEL: smax_i64: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: pushl %edi +; DAG-X86-NEXT: pushl %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; DAG-X86-NEXT: cmpl %eax, %ecx +; DAG-X86-NEXT: movl %esi, %edi +; DAG-X86-NEXT: sbbl %edx, %edi +; DAG-X86-NEXT: jl .LBB3_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: movl %esi, %edx +; DAG-X86-NEXT: .LBB3_2: +; DAG-X86-NEXT: popl %esi +; DAG-X86-NEXT: popl %edi +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: smax_i64: ; FASTISEL-X86: # %bb.0: @@ -184,6 +226,44 @@ define i64 @smax_i64(i64 %a, i64 %b) nounwind readnone { ; FASTISEL-X86-NEXT: popl %esi ; FASTISEL-X86-NEXT: popl %edi ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: smax_i64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: cmpl %eax, %esi +; GISEL-X86-NEXT: seta %bl +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: cmpl %edx, %ebp +; GISEL-X86-NEXT: setg %bh +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ecx, %ecx +; GISEL-X86-NEXT: je .LBB3_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movb %bl, %bh +; GISEL-X86-NEXT: .LBB3_2: +; GISEL-X86-NEXT: movzbl %bh, %edi +; GISEL-X86-NEXT: andl $1, %edi +; GISEL-X86-NEXT: je .LBB3_4 +; GISEL-X86-NEXT: # %bb.3: +; GISEL-X86-NEXT: movl %esi, %eax +; GISEL-X86-NEXT: .LBB3_4: +; GISEL-X86-NEXT: testl %edi, %edi +; GISEL-X86-NEXT: je .LBB3_6 +; GISEL-X86-NEXT: # %bb.5: +; GISEL-X86-NEXT: movl %ebp, %edx +; GISEL-X86-NEXT: .LBB3_6: +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl %ret = call i64 @llvm.smax.i64(i64 %a, i64 %b) ret i64 %ret } diff --git a/llvm/test/CodeGen/X86/isel-smin.ll b/llvm/test/CodeGen/X86/isel-smin.ll index 7349a7c..bbed3c3 100644 --- a/llvm/test/CodeGen/X86/isel-smin.ll +++ b/llvm/test/CodeGen/X86/isel-smin.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s 
--check-prefixes=X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone { -; X64-LABEL: smin_i8: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: cmovll %edi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; DAG-X64-LABEL: smin_i8: +; DAG-X64: # %bb.0: +; DAG-X64-NEXT: movl %esi, %eax +; DAG-X64-NEXT: cmpb %al, %dil +; DAG-X64-NEXT: cmovll %edi, %eax +; DAG-X64-NEXT: # kill: def $al killed $al killed $eax +; DAG-X64-NEXT: retq ; ; FASTISEL-X64-LABEL: smin_i8: ; FASTISEL-X64: # %bb.0: @@ -24,6 +24,17 @@ define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone { ; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax ; FASTISEL-X64-NEXT: retq ; +; GISEL-X64-LABEL: smin_i8: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %esi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpb %al, %dil +; GISEL-X64-NEXT: setl %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovnew %di, %ax +; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X64-NEXT: retq +; ; X86-LABEL: smin_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx @@ -35,16 +46,20 @@ define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone { ; X86-NEXT: .LBB0_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: smin_i8: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpb %cl, %al -; FASTISEL-X86-NEXT: jl .LBB0_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB0_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: smin_i8: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpb %al, %cl +; GISEL-X86-NEXT: setl %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB0_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB0_2: +; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X86-NEXT: retl %ret = call i8 @llvm.smin.i8(i8 %a, i8 %b) ret i8 %ret } @@ -57,25 +72,28 @@ define i16 @smin_i16(i16 %a, i16 %b) nounwind readnone { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smin_i16: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpw %ax, %di -; FASTISEL-X64-NEXT: cmovll %edi, %eax -; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: smin_i16: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpw %si, %ax +; GISEL-X64-NEXT: setl 
%cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovew %si, %ax +; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: smin_i16: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: jl .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; DAG-X86-LABEL: smin_i16: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: cmpw %cx, %ax +; DAG-X86-NEXT: jl .LBB1_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: .LBB1_2: +; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: smin_i16: ; FASTISEL-X86: # %bb.0: @@ -88,6 +106,21 @@ define i16 @smin_i16(i16 %a, i16 %b) nounwind readnone { ; FASTISEL-X86-NEXT: .LBB1_2: ; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: smin_i16: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpw %ax, %cx +; GISEL-X86-NEXT: setl %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB1_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB1_2: +; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X86-NEXT: retl %ret = call i16 @llvm.smin.i16(i16 %a, i16 %b) ret i16 %ret } @@ -99,12 +132,15 @@ define i32 @smin_i32(i32 %a, i32 %b) nounwind readnone { ; X64-NEXT: cmovll %edi, %eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smin_i32: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpl %esi, %edi -; FASTISEL-X64-NEXT: cmovll %edi, %eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: smin_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setl %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovel %esi, %eax +; GISEL-X64-NEXT: retq ; ; X86-LABEL: smin_i32: ; X86: # %bb.0: @@ -117,16 +153,19 @@ define i32 @smin_i32(i32 %a, i32 %b) nounwind readnone { ; X86-NEXT: .LBB2_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: smin_i32: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpl %ecx, %eax -; FASTISEL-X86-NEXT: jl .LBB2_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB2_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: smin_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpl %eax, %ecx +; GISEL-X86-NEXT: setl %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB2_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB2_2: +; GISEL-X86-NEXT: retl %ret = call i32 @llvm.smin.i32(i32 %a, i32 %b) ret i32 %ret } @@ -138,32 +177,35 @@ define i64 @smin_i64(i64 %a, i64 %b) nounwind readnone { ; X64-NEXT: cmovlq %rdi, %rax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: smin_i64: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movq %rsi, %rax -; FASTISEL-X64-NEXT: cmpq %rsi, %rdi -; FASTISEL-X64-NEXT: cmovlq %rdi, %rax -; 
FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: smin_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpq %rsi, %rdi +; GISEL-X64-NEXT: setl %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmoveq %rsi, %rax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: smin_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: jl .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: .LBB3_2: -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; DAG-X86-LABEL: smin_i64: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: pushl %edi +; DAG-X86-NEXT: pushl %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; DAG-X86-NEXT: cmpl %ecx, %eax +; DAG-X86-NEXT: movl %edx, %edi +; DAG-X86-NEXT: sbbl %esi, %edi +; DAG-X86-NEXT: jl .LBB3_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: movl %esi, %edx +; DAG-X86-NEXT: .LBB3_2: +; DAG-X86-NEXT: popl %esi +; DAG-X86-NEXT: popl %edi +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: smin_i64: ; FASTISEL-X86: # %bb.0: @@ -184,6 +226,44 @@ define i64 @smin_i64(i64 %a, i64 %b) nounwind readnone { ; FASTISEL-X86-NEXT: popl %esi ; FASTISEL-X86-NEXT: popl %edi ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: smin_i64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: cmpl %eax, %esi +; GISEL-X86-NEXT: setb %bl +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: cmpl %edx, %ebp +; GISEL-X86-NEXT: setl %bh +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ecx, %ecx +; GISEL-X86-NEXT: je .LBB3_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movb %bl, %bh +; GISEL-X86-NEXT: .LBB3_2: +; GISEL-X86-NEXT: movzbl %bh, %edi +; GISEL-X86-NEXT: andl $1, %edi +; GISEL-X86-NEXT: je .LBB3_4 +; GISEL-X86-NEXT: # %bb.3: +; GISEL-X86-NEXT: movl %esi, %eax +; GISEL-X86-NEXT: .LBB3_4: +; GISEL-X86-NEXT: testl %edi, %edi +; GISEL-X86-NEXT: je .LBB3_6 +; GISEL-X86-NEXT: # %bb.5: +; GISEL-X86-NEXT: movl %ebp, %edx +; GISEL-X86-NEXT: .LBB3_6: +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl %ret = call i64 @llvm.smin.i64(i64 %a, i64 %b) ret i64 %ret } diff --git a/llvm/test/CodeGen/X86/isel-umax.ll b/llvm/test/CodeGen/X86/isel-umax.ll index a90456c..990af26 100644 --- a/llvm/test/CodeGen/X86/isel-umax.ll +++ b/llvm/test/CodeGen/X86/isel-umax.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s 
-mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone { -; X64-LABEL: umax_i8: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: cmoval %edi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; DAG-X64-LABEL: umax_i8: +; DAG-X64: # %bb.0: +; DAG-X64-NEXT: movl %esi, %eax +; DAG-X64-NEXT: cmpb %al, %dil +; DAG-X64-NEXT: cmoval %edi, %eax +; DAG-X64-NEXT: # kill: def $al killed $al killed $eax +; DAG-X64-NEXT: retq ; ; FASTISEL-X64-LABEL: umax_i8: ; FASTISEL-X64: # %bb.0: @@ -24,6 +24,17 @@ define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone { ; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax ; FASTISEL-X64-NEXT: retq ; +; GISEL-X64-LABEL: umax_i8: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %esi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpb %al, %dil +; GISEL-X64-NEXT: seta %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovnew %di, %ax +; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X64-NEXT: retq +; ; X86-LABEL: umax_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx @@ -35,16 +46,20 @@ define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone { ; X86-NEXT: .LBB0_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: umax_i8: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpb %cl, %al -; FASTISEL-X86-NEXT: ja .LBB0_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB0_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: umax_i8: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpb %al, %cl +; GISEL-X86-NEXT: seta %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB0_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB0_2: +; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X86-NEXT: retl %ret = call i8 @llvm.umax.i8(i8 %a, i8 %b) ret i8 %ret } @@ -57,25 +72,28 @@ define i16 @umax_i16(i16 %a, i16 %b) nounwind readnone { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umax_i16: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpw %ax, %di -; FASTISEL-X64-NEXT: cmoval %edi, %eax -; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umax_i16: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: 
cmpw %si, %ax +; GISEL-X64-NEXT: seta %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovew %si, %ax +; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: umax_i16: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: ja .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; DAG-X86-LABEL: umax_i16: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: cmpw %cx, %ax +; DAG-X86-NEXT: ja .LBB1_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: .LBB1_2: +; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: umax_i16: ; FASTISEL-X86: # %bb.0: @@ -88,6 +106,21 @@ define i16 @umax_i16(i16 %a, i16 %b) nounwind readnone { ; FASTISEL-X86-NEXT: .LBB1_2: ; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: umax_i16: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpw %ax, %cx +; GISEL-X86-NEXT: seta %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB1_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB1_2: +; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X86-NEXT: retl %ret = call i16 @llvm.umax.i16(i16 %a, i16 %b) ret i16 %ret } @@ -99,12 +132,15 @@ define i32 @umax_i32(i32 %a, i32 %b) nounwind readnone { ; X64-NEXT: cmoval %edi, %eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umax_i32: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpl %esi, %edi -; FASTISEL-X64-NEXT: cmoval %edi, %eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umax_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: seta %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovel %esi, %eax +; GISEL-X64-NEXT: retq ; ; X86-LABEL: umax_i32: ; X86: # %bb.0: @@ -117,16 +153,19 @@ define i32 @umax_i32(i32 %a, i32 %b) nounwind readnone { ; X86-NEXT: .LBB2_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: umax_i32: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpl %ecx, %eax -; FASTISEL-X86-NEXT: ja .LBB2_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB2_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: umax_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpl %eax, %ecx +; GISEL-X86-NEXT: seta %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB2_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB2_2: +; GISEL-X86-NEXT: retl %ret = call i32 @llvm.umax.i32(i32 %a, i32 %b) ret i32 %ret } @@ -138,32 +177,35 @@ define i64 @umax_i64(i64 %a, i64 %b) nounwind readnone { ; X64-NEXT: cmovaq %rdi, %rax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umax_i64: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movq %rsi, %rax -; FASTISEL-X64-NEXT: cmpq %rsi, %rdi -; 
FASTISEL-X64-NEXT: cmovaq %rdi, %rax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umax_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpq %rsi, %rdi +; GISEL-X64-NEXT: seta %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmoveq %rsi, %rax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: umax_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: jb .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: .LBB3_2: -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; DAG-X86-LABEL: umax_i64: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: pushl %edi +; DAG-X86-NEXT: pushl %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; DAG-X86-NEXT: cmpl %eax, %ecx +; DAG-X86-NEXT: movl %esi, %edi +; DAG-X86-NEXT: sbbl %edx, %edi +; DAG-X86-NEXT: jb .LBB3_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: movl %esi, %edx +; DAG-X86-NEXT: .LBB3_2: +; DAG-X86-NEXT: popl %esi +; DAG-X86-NEXT: popl %edi +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: umax_i64: ; FASTISEL-X86: # %bb.0: @@ -184,6 +226,44 @@ define i64 @umax_i64(i64 %a, i64 %b) nounwind readnone { ; FASTISEL-X86-NEXT: popl %esi ; FASTISEL-X86-NEXT: popl %edi ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: umax_i64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: cmpl %eax, %esi +; GISEL-X86-NEXT: seta %bl +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: cmpl %edx, %ebp +; GISEL-X86-NEXT: seta %bh +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ecx, %ecx +; GISEL-X86-NEXT: je .LBB3_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movb %bl, %bh +; GISEL-X86-NEXT: .LBB3_2: +; GISEL-X86-NEXT: movzbl %bh, %edi +; GISEL-X86-NEXT: andl $1, %edi +; GISEL-X86-NEXT: je .LBB3_4 +; GISEL-X86-NEXT: # %bb.3: +; GISEL-X86-NEXT: movl %esi, %eax +; GISEL-X86-NEXT: .LBB3_4: +; GISEL-X86-NEXT: testl %edi, %edi +; GISEL-X86-NEXT: je .LBB3_6 +; GISEL-X86-NEXT: # %bb.5: +; GISEL-X86-NEXT: movl %ebp, %edx +; GISEL-X86-NEXT: .LBB3_6: +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl %ret = call i64 @llvm.umax.i64(i64 %a, i64 %b) ret i64 %ret } diff --git a/llvm/test/CodeGen/X86/isel-umin.ll b/llvm/test/CodeGen/X86/isel-umin.ll index 53a0b27..1710b9f 100644 --- a/llvm/test/CodeGen/X86/isel-umin.ll +++ b/llvm/test/CodeGen/X86/isel-umin.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s 
--check-prefixes=X64 -; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone { -; X64-LABEL: umin_i8: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: cmpb %al, %dil -; X64-NEXT: cmovbl %edi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; DAG-X64-LABEL: umin_i8: +; DAG-X64: # %bb.0: +; DAG-X64-NEXT: movl %esi, %eax +; DAG-X64-NEXT: cmpb %al, %dil +; DAG-X64-NEXT: cmovbl %edi, %eax +; DAG-X64-NEXT: # kill: def $al killed $al killed $eax +; DAG-X64-NEXT: retq ; ; FASTISEL-X64-LABEL: umin_i8: ; FASTISEL-X64: # %bb.0: @@ -24,6 +24,17 @@ define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone { ; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax ; FASTISEL-X64-NEXT: retq ; +; GISEL-X64-LABEL: umin_i8: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %esi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpb %al, %dil +; GISEL-X64-NEXT: setb %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovnew %di, %ax +; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X64-NEXT: retq +; ; X86-LABEL: umin_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx @@ -35,16 +46,20 @@ define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone { ; X86-NEXT: .LBB0_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: umin_i8: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpb %cl, %al -; FASTISEL-X86-NEXT: jb .LBB0_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB0_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: umin_i8: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpb %al, %cl +; GISEL-X86-NEXT: setb %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB0_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB0_2: +; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax +; GISEL-X86-NEXT: retl %ret = call i8 @llvm.umin.i8(i8 %a, i8 %b) ret i8 %ret } @@ -57,25 +72,28 @@ define i16 @umin_i16(i16 %a, i16 %b) nounwind readnone { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umin_i16: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpw %ax, %di -; FASTISEL-X64-NEXT: cmovbl %edi, %eax -; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umin_i16: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; 
GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpw %si, %ax +; GISEL-X64-NEXT: setb %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovew %si, %ax +; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: umin_i16: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: jb .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; DAG-X86-LABEL: umin_i16: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: cmpw %cx, %ax +; DAG-X86-NEXT: jb .LBB1_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: .LBB1_2: +; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: umin_i16: ; FASTISEL-X86: # %bb.0: @@ -88,6 +106,21 @@ define i16 @umin_i16(i16 %a, i16 %b) nounwind readnone { ; FASTISEL-X86-NEXT: .LBB1_2: ; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: umin_i16: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpw %ax, %cx +; GISEL-X86-NEXT: setb %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB1_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB1_2: +; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax +; GISEL-X86-NEXT: retl %ret = call i16 @llvm.umin.i16(i16 %a, i16 %b) ret i16 %ret } @@ -99,12 +132,15 @@ define i32 @umin_i32(i32 %a, i32 %b) nounwind readnone { ; X64-NEXT: cmovbl %edi, %eax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umin_i32: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movl %esi, %eax -; FASTISEL-X64-NEXT: cmpl %esi, %edi -; FASTISEL-X64-NEXT: cmovbl %edi, %eax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umin_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movl %edi, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl %esi, %edi +; GISEL-X64-NEXT: setb %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovel %esi, %eax +; GISEL-X64-NEXT: retq ; ; X86-LABEL: umin_i32: ; X86: # %bb.0: @@ -117,16 +153,19 @@ define i32 @umin_i32(i32 %a, i32 %b) nounwind readnone { ; X86-NEXT: .LBB2_2: ; X86-NEXT: retl ; -; FASTISEL-X86-LABEL: umin_i32: -; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: cmpl %ecx, %eax -; FASTISEL-X86-NEXT: jb .LBB2_2 -; FASTISEL-X86-NEXT: # %bb.1: -; FASTISEL-X86-NEXT: movl %ecx, %eax -; FASTISEL-X86-NEXT: .LBB2_2: -; FASTISEL-X86-NEXT: retl +; GISEL-X86-LABEL: umin_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: xorl %edx, %edx +; GISEL-X86-NEXT: cmpl %eax, %ecx +; GISEL-X86-NEXT: setb %dl +; GISEL-X86-NEXT: andl $1, %edx +; GISEL-X86-NEXT: je .LBB2_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movl %ecx, %eax +; GISEL-X86-NEXT: .LBB2_2: +; GISEL-X86-NEXT: retl %ret = call i32 @llvm.umin.i32(i32 %a, i32 %b) ret i32 %ret } @@ -138,32 +177,35 @@ define i64 @umin_i64(i64 %a, i64 %b) nounwind readnone { ; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: retq ; -; FASTISEL-X64-LABEL: umin_i64: -; FASTISEL-X64: # %bb.0: -; FASTISEL-X64-NEXT: movq %rsi, 
%rax -; FASTISEL-X64-NEXT: cmpq %rsi, %rdi -; FASTISEL-X64-NEXT: cmovbq %rdi, %rax -; FASTISEL-X64-NEXT: retq +; GISEL-X64-LABEL: umin_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpq %rsi, %rdi +; GISEL-X64-NEXT: setb %cl +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmoveq %rsi, %rax +; GISEL-X64-NEXT: retq ; -; X86-LABEL: umin_i64: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: jb .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: .LBB3_2: -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; DAG-X86-LABEL: umin_i64: +; DAG-X86: # %bb.0: +; DAG-X86-NEXT: pushl %edi +; DAG-X86-NEXT: pushl %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; DAG-X86-NEXT: cmpl %ecx, %eax +; DAG-X86-NEXT: movl %edx, %edi +; DAG-X86-NEXT: sbbl %esi, %edi +; DAG-X86-NEXT: jb .LBB3_2 +; DAG-X86-NEXT: # %bb.1: +; DAG-X86-NEXT: movl %ecx, %eax +; DAG-X86-NEXT: movl %esi, %edx +; DAG-X86-NEXT: .LBB3_2: +; DAG-X86-NEXT: popl %esi +; DAG-X86-NEXT: popl %edi +; DAG-X86-NEXT: retl ; ; FASTISEL-X86-LABEL: umin_i64: ; FASTISEL-X86: # %bb.0: @@ -184,6 +226,44 @@ define i64 @umin_i64(i64 %a, i64 %b) nounwind readnone { ; FASTISEL-X86-NEXT: popl %esi ; FASTISEL-X86-NEXT: popl %edi ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: umin_i64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: cmpl %eax, %esi +; GISEL-X86-NEXT: setb %bl +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: cmpl %edx, %ebp +; GISEL-X86-NEXT: setb %bh +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ecx, %ecx +; GISEL-X86-NEXT: je .LBB3_2 +; GISEL-X86-NEXT: # %bb.1: +; GISEL-X86-NEXT: movb %bl, %bh +; GISEL-X86-NEXT: .LBB3_2: +; GISEL-X86-NEXT: movzbl %bh, %edi +; GISEL-X86-NEXT: andl $1, %edi +; GISEL-X86-NEXT: je .LBB3_4 +; GISEL-X86-NEXT: # %bb.3: +; GISEL-X86-NEXT: movl %esi, %eax +; GISEL-X86-NEXT: .LBB3_4: +; GISEL-X86-NEXT: testl %edi, %edi +; GISEL-X86-NEXT: je .LBB3_6 +; GISEL-X86-NEXT: # %bb.5: +; GISEL-X86-NEXT: movl %ebp, %edx +; GISEL-X86-NEXT: .LBB3_6: +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl %ret = call i64 @llvm.umin.i64(i64 %a, i64 %b) ret i64 %ret } diff --git a/llvm/test/CodeGen/X86/pr161693.ll b/llvm/test/CodeGen/X86/pr161693.ll new file mode 100644 index 0000000..de8188f --- /dev/null +++ b/llvm/test/CodeGen/X86/pr161693.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s + +define void @PR161693() #0 { +; CHECK-LABEL: PR161693: +; CHECK: # %bb.0: # %start +; CHECK-NEXT: movzbl (%rax), %eax +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: addb $-71, %al +; CHECK-NEXT: 
.p2align 4 +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cmpb $-6, %al +; CHECK-NEXT: setb %cl +; CHECK-NEXT: leal (%rcx,%rcx), %edx +; CHECK-NEXT: orb %cl, %dl +; CHECK-NEXT: leal (,%rdx,4), %ecx +; CHECK-NEXT: orb %dl, %cl +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: retq +start: + br label %loop + +loop: + %.val.i.i89 = load <16 x i8>, ptr poison, align 1 + %.not49.i = icmp ult <16 x i8> zeroinitializer, splat (i8 -10) + %i = and <16 x i8> %.val.i.i89, splat (i8 -33) + %i1 = add <16 x i8> %i, splat (i8 -71) + %.not51.i = icmp ult <16 x i8> %i1, splat (i8 -6) + %.not46.i = and <16 x i1> %.not49.i, %.not51.i + %i2 = bitcast <16 x i1> %.not46.i to i16 + %_0.i = icmp eq i16 %i2, 0 + br i1 %_0.i, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+soft-float" } diff --git a/llvm/test/DebugInfo/symbolize-build-id.test b/llvm/test/DebugInfo/symbolize-build-id.test index d63f43f..2620718 100644 --- a/llvm/test/DebugInfo/symbolize-build-id.test +++ b/llvm/test/DebugInfo/symbolize-build-id.test @@ -21,6 +21,7 @@ Sections: Type: SHT_NOTE Flags: [ SHF_ALLOC ] Content: 040000000800000003000000474e5500abb50d82b6bdc861 + AddressAlign: 4 ProgramHeaders: - Type: PT_NOTE Flags: [ PF_R ] diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s index 9296f04..ed76a28 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s @@ -22,7 +22,7 @@ # CHECK-OBJ: Contents of section .rodata: # CHECK-OBJ: 0000 48310048 32004833 00 H1.H2.H3. -# CHECK-LG: Starting link phase 1 for graph +# CHECK-LG: Starting link phase 1 # CHECK-LG: section .rodata: # CHECK-LG: block 0x0 size = 0x00000009, align = 1, alignment-offset = 0 diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s new file mode 100644 index 0000000..557e403 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s @@ -0,0 +1,7 @@ + .section __DATA,__data + .globl x + .p2align 2, 0x0 +x: + .long 0 + +.subsections_via_symbols diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s new file mode 100644 index 0000000..711c8a0 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s @@ -0,0 +1,7 @@ + .section __DATA,__data + .globl x + .p2align 2, 0x0 +x: + .long 1 + +.subsections_via_symbols diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s new file mode 100644 index 0000000..c58f84e --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s @@ -0,0 +1,32 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/main.o %s +# RUN: llvm-mc -triple=arm64-apple-darwin -filetype=obj -o %t/x.arm64.o \ +# RUN: %S/Inputs/x-1.s +# RUN: llvm-ar crs %t/libX.arm64.a %t/x.arm64.o +# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/x.arm64e.o \ +# RUN: %S/Inputs/x-0.s +# RUN: llvm-ar crs %t/libX.arm64e.a %t/x.arm64e.o +# RUN: llvm-lipo --create --output %t/libX.a %t/libX.arm64.a %t/libX.arm64e.a +# RUN: llvm-jitlink -noexec -check=%s %t/main.o -L%t -lX +# +# Create a universal archive with two slices (arm64e, arm64) 
each containing +# a definition of X: in arm64e X = 0, in arm64 X = 1. +# Check that if we load an arm64e object file then we link the arm64e slice +# of the archive by verifying that X = 0. +# + +# jitlink-check: *{4}x = 0 + + .section __TEXT,__text,regular,pure_instructions + .globl _main + .p2align 2 +_main: + mov w0, #0 + ret + + .section __DATA,__data + .globl p +p: + .quad x + +.subsections_via_symbols diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s index 2b5c9e3..5f6babf 100644 --- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s @@ -102,7 +102,7 @@ p: call o .size p, .-p -# CHECK: Link graph "{{.*}}" before copy-and-fixup: +# CHECK: Link graph before copy-and-fixup: # CHECK: section .text: # CHECK: block 0x1000 # CHECK: symbols: diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s index 3bbfd55..c31250b 100644 --- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s @@ -131,7 +131,7 @@ p: call o .size p, .-p -# CHECK: Link graph "{{.*}}" before copy-and-fixup: +# CHECK: Link graph before copy-and-fixup: # CHECK: section .text: # CHECK: block 0x1000 # CHECK: symbols: diff --git a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll new file mode 100644 index 0000000..512ea37 --- /dev/null +++ b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=gvn -S -enable-gvn-memdep=true < %s | FileCheck %s +; RUN: opt -passes=gvn -S -enable-gvn-memdep=false < %s | FileCheck %s --check-prefix=MEMDEPFALSE + +define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) { +; CHECK-LABEL: @forward_binop_with_sel( +; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; +; MEMDEPFALSE-LABEL: @forward_binop_with_sel( +; MEMDEPFALSE-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; MEMDEPFALSE-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; MEMDEPFALSE-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; MEMDEPFALSE-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; MEMDEPFALSE-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; 
MEMDEPFALSE-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) +; MEMDEPFALSE-NEXT: [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]]) +; MEMDEPFALSE-NEXT: ret <4 x float> [[LOAD_1_0]] +; + %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b) + %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %fmul = fmul <4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask) + %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough) + ret <4 x float> %load.1.0 +} diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll index 984a756..b112e99 100644 --- a/llvm/test/Transforms/GVN/masked-load-store.ll +++ b/llvm/test/Transforms/GVN/masked-load-store.ll @@ -36,6 +36,180 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ret <128 x i8> %v4 } -declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>) -declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>) +define <4 x float> @forward_masked_load(ptr %0, ptr %1) { +; CHECK-LABEL: @forward_masked_load( +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEXT: ret <4 x float> [[TMP4]] +; + %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4) + %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask) + %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + ret <4 x float> %load2 +} + +define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, <4 x i1> %mask) { +; CHECK-LABEL: @forward_masked_load_arbitrary_mask( +; CHECK-NEXT: [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[LOC_A:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr [[LOC_B:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[LOAD1]], <4 x float> zeroinitializer +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_a, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %loc_b, i32 1, <4 x i1> %mask) + %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_b, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + ret <4 x float> %load2 +} + +define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) { +; CHECK-LABEL: @forward_binop_splat_i1_mask( +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; 
CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEXT: ret <4 x float> [[FMUL]] +; + %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4) + %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %fmul = fmul <4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask) + %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + ret <4 x float> %load.1.0 +} + +define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) { +; CHECK-LABEL: @forward_binop_with_sel( +; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b) + %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) + %fmul = fmul <4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask) + %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough) + ret <4 x float> %load.1.0 +} + +define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @forward_masked_load_scalable( +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]] +; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]] +; + %mask = 
call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) + call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask) + %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load2 +} +define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @forward_masked_load_scalable_type_mismatch( +; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD2]] +; + %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load1 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> zeroinitializer) + call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask) + %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load2 +} + +define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @generate_sel_with_passthrough( +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] +; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]] +; + %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer) + call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask) + %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load2 +} + +define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @forward_binop_with_sel_scalable( +; CHECK-NEXT: [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, 
<vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] +; CHECK-NEXT: ret <vscale x 4 x float> [[TMP3]] +; + %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer) + %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask) + %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load.1.0 +} + +define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @load_mask_differs( +; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) +; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]]) +; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]] +; + %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) + %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer) + %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0) + %load.1.0 = call <vscale x 4 x float> 
@llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load.1.0 +} + +define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { +; CHECK-LABEL: @store_mask_differs( +; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) +; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]]) +; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]] +; + %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) + %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) + %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer) + %gep.0.16 = getelementptr i8, ptr %0, i32 16 + %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer) + %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16 + call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1) + %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough) + ret <vscale x 4 x float> %load.1.0 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index c3b0bc8..27ca414 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -86,7 +86,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 41 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 229209e..5ae0839 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -204,37 +204,33 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, 
i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -670,39 +666,35 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; 
CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -996,36 +988,32 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, 
ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[TMP15]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -1140,32 +1128,28 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: 
[[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]] -; CHECK-SVE-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[TMP12]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -1277,36 +1261,32 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = 
phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP16]], [[TMP17]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index dd239c0..8ece59a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -81,7 +81,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 49e9989..09b41fb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -12,40 +12,40 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale 
x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]]) +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -82,14 +82,14 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> ; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] ; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: 
[[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]]) -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]]) +; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]] +; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: @@ -123,40 +123,40 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr 
[[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]]) +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP23:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -193,14 +193,14 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> ; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] ; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]]) -; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]]) +; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]] +; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 6e11e55..3a88273 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -12,74 +12,62 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: 
[[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP6]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call 
i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP14]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP28]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; 
CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP11]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -139,78 +127,52 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP9]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[TMP14]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> 
@llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i64 [[TMP4]] ; ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod( ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1 -; 
CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[TMP22]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[TMP23]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]] ; ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod( ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { @@ -274,86 +236,66 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i64 [[TMP10]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP10]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP2]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD3]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[TMP16]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP17]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i64 [[TMP4]] ; ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod( ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: 
[[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i16>, ptr [[TMP30]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD4]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i16>, ptr [[TMP21]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD5]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD6]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP22]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 2 x i64> [[TMP24]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; 
CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP27]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP15]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]] ; ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod( ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { @@ -497,7 +439,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] @@ -656,7 +598,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x 
i32> [[TMP138]], [[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -803,7 +745,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 @@ -851,7 +793,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 @@ -952,7 +894,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -990,7 +932,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1058,22 +1000,18 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-INTERLEAVE1-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1085,38 +1023,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]] -; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32> -; 
CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]] -; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]] -; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x 
i32> [[TMP20]], [[TMP21]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP41]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP35]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP30]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1124,26 +1062,22 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 3 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP65:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1155,90 +1089,74 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP56]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = shl nuw i64 [[TMP25]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 
4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]] -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]] -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = shl nuw i64 [[TMP35]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]] -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]] -; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]] -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = shl nuw i64 [[TMP57]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]] -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]] -; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], 
[[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]] -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = shl nuw i64 [[TMP67]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = shl nuw i64 [[TMP73]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]] -; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]] -; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = 
sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]]) +; 
CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]] -; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]] -; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]] -; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -1396,7 +1314,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1434,7 +1352,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] 
= extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1525,7 +1443,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1572,7 +1490,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) @@ -1607,7 +1525,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1666,7 +1584,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call 
i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1713,7 +1631,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) @@ -1748,7 +1666,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1866,7 +1784,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1978,7 +1896,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2016,7 +1934,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2053,7 +1971,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2111,7 +2029,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2149,7 +2067,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2186,7 +2104,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2226,36 +2144,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; 
CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-INTERLEAVE1: for.body.preheader: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -2267,50 +2181,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-INTERLEAVED: for.body.preheader: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> 
[[WIDE_LOAD3]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]] -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -2349,7 +2245,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2471,7 +2367,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = 
call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) @@ -2571,7 +2467,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2671,7 +2567,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index 11ff688..7bb4715 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -12,77 +12,65 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi 
<vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP21]] -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 4 x i32> [[VEC_PHI1]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to 
<16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP26]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index db3166c..3c2ae1c7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -17,16 +17,16 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = add <16 x i32> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -38,22 +38,22 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -199,16 +199,16 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 
x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP3]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -220,22 +220,22 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -293,16 +293,16 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: 
vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -314,22 +314,22 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i32 8 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <8 x i64> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; 
CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -764,16 +764,16 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -785,22 +785,22 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]] 
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -984,21 +984,21 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1015,26 +1015,26 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index c61361b..25ee100 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: 
Generic::ScalarRC, 9 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %for.body diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index e09ddb4..731d648 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1636,7 +1636,11 @@ static std::pair<Triple, SubtargetFeatures> getFirstFileTripleAndFeatures() { case file_magic::macho_object: { auto Obj = ExitOnErr( object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef())); - Triple TT = Obj->makeTriple(); + Triple TT; + if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj.get())) + TT = MachOObj->getArchTriple(); + else + TT = Obj->makeTriple(); if (Magic == file_magic::coff_object) { // TODO: Move this to makeTriple() if possible. TT.setObjectFormat(Triple::COFF); diff --git a/llvm/unittests/Object/BuildIDTest.cpp b/llvm/unittests/Object/BuildIDTest.cpp new file mode 100644 index 0000000..04ca636 --- /dev/null +++ b/llvm/unittests/Object/BuildIDTest.cpp @@ -0,0 +1,120 @@ +//===- BuildIDTest.cpp - Tests for getBuildID ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/BuildID.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Testing/Support/Error.h" + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::object; + +template <class ELFT> +static Expected<ELFObjectFile<ELFT>> toBinary(SmallVectorImpl<char> &Storage, + StringRef Yaml) { + raw_svector_ostream OS(Storage); + yaml::Input YIn(Yaml); + if (!yaml::convertYAML(YIn, OS, [](const Twine &Msg) {})) + return createStringError(std::errc::invalid_argument, + "unable to convert YAML"); + return ELFObjectFile<ELFT>::create(MemoryBufferRef(OS.str(), "dummyELF")); +} + +static StringRef getInvalidNoteELF(bool WithShdr) { + static std::string WithSection(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + FileSize: 0x1a + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id +Sections: + - Name: .note.gnu.build-id + Type: SHT_NOTE + AddressAlign: 0x04 + Notes: + - Name: "GNU" + Desc: "abb50d82b6bdc861" + Type: 3 +)"); + static std::string WithoutSection(WithSection + R"( + - Type: SectionHeaderTable + NoHeaders: true +)"); + if (WithShdr) + return WithSection; + return WithoutSection; +} + +// The BuildID can be looked up from a section header, if there is no program +// header. +TEST(BuildIDTest, InvalidPhdrFileSizeWithShdrs) { + SmallString<0> Storage; + Expected<ELFObjectFile<ELF64LE>> ElfOrErr = + toBinary<ELF64LE>(Storage, getInvalidNoteELF(true)); + ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded()); + BuildIDRef BuildID = getBuildID(&ElfOrErr.get()); + EXPECT_EQ( + StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()), + "\xAB\xB5\x0D\x82\xB6\xBD\xC8\x61"); +} + +// The code handles a malformed program header that points at data outside the +// file. 
+TEST(BuildIDTest, InvalidPhdrFileSizeNoShdrs) { + SmallString<0> Storage; + Expected<ELFObjectFile<ELF64LE>> ElfOrErr = + toBinary<ELF64LE>(Storage, getInvalidNoteELF(false)); + ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded()); + BuildIDRef BuildID = getBuildID(&ElfOrErr.get()); + EXPECT_EQ( + StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()), + ""); +} + +// The code handles a malformed section header that points at data outside the +// file. +TEST(BuildIDTest, InvalidSectionHeader) { + SmallString<0> Storage; + Expected<ELFObjectFile<ELF64LE>> ElfOrErr = toBinary<ELF64LE>(Storage, R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id +Sections: + - Name: .note.gnu.build-id + Type: SHT_NOTE + AddressAlign: 0x04 + ShOffset: 0x1a1 + Notes: + - Name: "GNU" + Desc: "abb50d82b6bdc861" + Type: 3 +)"); + ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded()); + BuildIDRef BuildID = getBuildID(&ElfOrErr.get()); + EXPECT_EQ( + StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()), + "\xAB\xB5\x0D\x82\xB6\xBD\xC8\x61"); +} diff --git a/llvm/unittests/Object/CMakeLists.txt b/llvm/unittests/Object/CMakeLists.txt index 1343352..cd70a7b 100644 --- a/llvm/unittests/Object/CMakeLists.txt +++ b/llvm/unittests/Object/CMakeLists.txt @@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(ObjectTests ArchiveTest.cpp + BuildIDTest.cpp COFFObjectFileTest.cpp DXContainerTest.cpp ELFObjectFileTest.cpp diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index a943e7ac..b99d656 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -203,7 +203,7 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { VPInstruction::BranchOnCond, {Plan->getOrAddLiveIn(ConstantInt::getTrue(F->getContext()))})); VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - Plan, [](PHINode *P) { return nullptr; }, TLI); + *Plan, [](PHINode *P) { return nullptr; }, TLI); VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); EXPECT_EQ(0u, Entry->getNumPredecessors()); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp index eb075e6..b89d378 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp @@ -48,7 +48,7 @@ TEST_F(VPUncountableExitTest, FindUncountableExitRecipes) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader, /*HasUncountableExit=*/true); VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - Plan, [](PHINode *P) { return nullptr; }, *TLI); + *Plan, [](PHINode *P) { return nullptr; }, *TLI); VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan); SmallVector<VPRecipeBase *> Recipes; @@ -85,7 +85,7 @@ TEST_F(VPUncountableExitTest, NoUncountableExit) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - Plan, [](PHINode *P) { return nullptr; }, *TLI); + *Plan, [](PHINode *P) { return nullptr; }, *TLI); VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan); SmallVector<VPRecipeBase *> Recipes; diff --git 
a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn index b856d1c..764ebb9 100644 --- a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn @@ -28,6 +28,7 @@ static_library("Rewrite") { "BuildIDRewriter.cpp", "DWARFRewriter.cpp", "ExecutableFileMemoryManager.cpp", + "GNUPropertyRewriter.cpp", "JITLinkLinker.cpp", "LinuxKernelRewriter.cpp", "MachORewriteInstance.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn index 9fcb05c..54193c8 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn @@ -10,6 +10,7 @@ unittest("ObjectTests") { ] sources = [ "ArchiveTest.cpp", + "BuildIDTest.cpp", "COFFObjectFileTest.cpp", "DXContainerTest.cpp", "ELFObjectFileTest.cpp", diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 8ee6308..0d56259 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -259,22 +259,23 @@ void mlirIRRewriterDestroy(MlirRewriterBase rewriter) { /// RewritePatternSet and FrozenRewritePatternSet API //===----------------------------------------------------------------------===// -inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) { +static inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) { assert(module.ptr && "unexpected null module"); return *(static_cast<mlir::RewritePatternSet *>(module.ptr)); } -inline MlirRewritePatternSet wrap(mlir::RewritePatternSet *module) { +static inline MlirRewritePatternSet wrap(mlir::RewritePatternSet *module) { return {module}; } -inline mlir::FrozenRewritePatternSet * +static inline mlir::FrozenRewritePatternSet * unwrap(MlirFrozenRewritePatternSet module) { assert(module.ptr && "unexpected null module"); return static_cast<mlir::FrozenRewritePatternSet *>(module.ptr); } -inline MlirFrozenRewritePatternSet wrap(mlir::FrozenRewritePatternSet *module) { +static inline MlirFrozenRewritePatternSet +wrap(mlir::FrozenRewritePatternSet *module) { return {module}; } @@ -321,12 +322,12 @@ inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) { //===----------------------------------------------------------------------===// #if MLIR_ENABLE_PDL_IN_PATTERNMATCH -inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) { +static inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) { assert(module.ptr && "unexpected null module"); return static_cast<mlir::PDLPatternModule *>(module.ptr); } -inline MlirPDLPatternModule wrap(mlir::PDLPatternModule *module) { +static inline MlirPDLPatternModule wrap(mlir::PDLPatternModule *module) { return {module}; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 3bd763e..05fc7cb 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -1622,12 +1622,12 @@ static void generateCollapsedIndexingRegion( } } -void collapseOperandsAndResults(LinalgOp op, - const CollapsingInfo &collapsingInfo, - RewriterBase &rewriter, - SmallVectorImpl<Value> &inputOperands, - SmallVectorImpl<Value> &outputOperands, - SmallVectorImpl<Type> &resultTypes) { +static void collapseOperandsAndResults(LinalgOp op, + const CollapsingInfo &collapsingInfo, + RewriterBase &rewriter, + 
SmallVectorImpl<Value> &inputOperands, + SmallVectorImpl<Value> &outputOperands, + SmallVectorImpl<Type> &resultTypes) { Location loc = op->getLoc(); inputOperands = llvm::map_to_vector(op.getDpsInputOperands(), [&](OpOperand *opOperand) { @@ -1651,8 +1651,8 @@ void collapseOperandsAndResults(LinalgOp op, /// Clone a `LinalgOp` to a collapsed version of same name template <typename OpTy> -OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp, - const CollapsingInfo &collapsingInfo) { +static OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp, + const CollapsingInfo &collapsingInfo) { return nullptr; } @@ -1699,8 +1699,9 @@ GenericOp cloneToCollapsedOp<GenericOp>(RewriterBase &rewriter, return collapsedOp; } -LinalgOp createCollapsedOp(LinalgOp op, const CollapsingInfo &collapsingInfo, - RewriterBase &rewriter) { +static LinalgOp createCollapsedOp(LinalgOp op, + const CollapsingInfo &collapsingInfo, + RewriterBase &rewriter) { if (GenericOp genericOp = dyn_cast<GenericOp>(op.getOperation())) { return cloneToCollapsedOp(rewriter, genericOp, collapsingInfo); } else { diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp index fd8ae7e..795766f 100644 --- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp +++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp @@ -35,7 +35,7 @@ namespace mlir { using OperationDefinition = AsmParserState::OperationDefinition; /// Return the source code associated with the OperationDefinition. -SMRange getOpRange(const OperationDefinition &op) { +static SMRange getOpRange(const OperationDefinition &op) { const char *startOp = op.scopeLoc.Start.getPointer(); const char *endOp = op.scopeLoc.End.getPointer(); @@ -187,15 +187,15 @@ std::unique_ptr<RewritePad> RewritePad::init(StringRef inputFilename, } /// Return the source code associated with the operation name. -SMRange getOpNameRange(const OperationDefinition &op) { return op.loc; } +static SMRange getOpNameRange(const OperationDefinition &op) { return op.loc; } /// Return whether the operation was printed using generic syntax in original /// buffer. -bool isGeneric(const OperationDefinition &op) { +static bool isGeneric(const OperationDefinition &op) { return op.loc.Start.getPointer()[0] == '"'; } -inline int asMainReturnCode(LogicalResult r) { +static inline int asMainReturnCode(LogicalResult r) { return r.succeeded() ? EXIT_SUCCESS : EXIT_FAILURE; } @@ -293,7 +293,7 @@ static llvm::cl::opt<std::string> simpleRenameReplace{ llvm::cl::cat(clSimpleRenameCategory)}; // Rewriter that does simple renames. -LogicalResult simpleRename(RewritePad &rewriteState, raw_ostream &os) { +static LogicalResult simpleRename(RewritePad &rewriteState, raw_ostream &os) { StringRef opName = simpleRenameOpName; StringRef match = simpleRenameMatch; StringRef replace = simpleRenameReplace; @@ -317,7 +317,7 @@ static mlir::RewriterRegistration rewriteSimpleRename("simple-rename", simpleRename); // Rewriter that insert range markers. 
-LogicalResult markRanges(RewritePad &rewriteState, raw_ostream &os) { +static LogicalResult markRanges(RewritePad &rewriteState, raw_ostream &os) { for (const auto &it : rewriteState.getOpDefs()) { auto [startOp, endOp] = getOpRange(it); diff --git a/mlir/unittests/TableGen/PassGenTest.cpp b/mlir/unittests/TableGen/PassGenTest.cpp index 27f2fa0..ac01d49 100644 --- a/mlir/unittests/TableGen/PassGenTest.cpp +++ b/mlir/unittests/TableGen/PassGenTest.cpp @@ -11,7 +11,8 @@ #include "gmock/gmock.h" -std::unique_ptr<mlir::Pass> createTestPassWithCustomConstructor(int v = 0); +static std::unique_ptr<mlir::Pass> +createTestPassWithCustomConstructor(int v = 0); #define GEN_PASS_DECL #define GEN_PASS_REGISTRATION diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h index 3ea6406..14a3d8e 100644 --- a/orc-rt/include/orc-rt/SPSWrapperFunction.h +++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h @@ -21,8 +21,10 @@ namespace orc_rt { namespace detail { template <typename... SPSArgTs> struct WFSPSHelper { - template <typename... ArgTs> - std::optional<WrapperFunctionBuffer> serialize(const ArgTs &...Args) { +private: + template <typename... SerializableArgTs> + std::optional<WrapperFunctionBuffer> + serializeImpl(const SerializableArgTs &...Args) { auto R = WrapperFunctionBuffer::allocate(SPSArgList<SPSArgTs...>::size(Args...)); SPSOutputBuffer OB(R.data(), R.size()); @@ -31,16 +33,61 @@ template <typename... SPSArgTs> struct WFSPSHelper { return std::move(R); } + template <typename T> static const T &toSerializable(const T &Arg) noexcept { + return Arg; + } + + static SPSSerializableError toSerializable(Error Err) noexcept { + return SPSSerializableError(std::move(Err)); + } + + template <typename T> + static SPSSerializableExpected<T> toSerializable(Expected<T> Arg) noexcept { + return SPSSerializableExpected<T>(std::move(Arg)); + } + + template <typename... Ts> struct DeserializableTuple; + + template <typename... Ts> struct DeserializableTuple<std::tuple<Ts...>> { + typedef std::tuple< + std::decay_t<decltype(toSerializable(std::declval<Ts>()))>...> + type; + }; + + template <typename... Ts> + using DeserializableTuple_t = typename DeserializableTuple<Ts...>::type; + + template <typename T> static T fromSerializable(T &&Arg) noexcept { + return Arg; + } + + static Error fromSerializable(SPSSerializableError Err) noexcept { + return Err.toError(); + } + + template <typename T> + static Expected<T> fromSerializable(SPSSerializableExpected<T> Val) noexcept { + return Val.toExpected(); + } + +public: + template <typename... 
ArgTs> + std::optional<WrapperFunctionBuffer> serialize(ArgTs &&...Args) { + return serializeImpl(toSerializable(std::forward<ArgTs>(Args))...); + } + template <typename ArgTuple> std::optional<ArgTuple> deserialize(WrapperFunctionBuffer ArgBytes) { assert(!ArgBytes.getOutOfBandError() && "Should not attempt to deserialize out-of-band error"); SPSInputBuffer IB(ArgBytes.data(), ArgBytes.size()); - ArgTuple Args; - if (!SPSSerializationTraits<SPSTuple<SPSArgTs...>, ArgTuple>::deserialize( - IB, Args)) + DeserializableTuple_t<ArgTuple> Args; + if (!SPSSerializationTraits<SPSTuple<SPSArgTs...>, + decltype(Args)>::deserialize(IB, Args)) return std::nullopt; - return Args; + return std::apply( + [](auto &&...A) { return ArgTuple(fromSerializable(A)...); }, + std::move(Args)); } }; diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h index 233c3b2..ca165db 100644 --- a/orc-rt/include/orc-rt/WrapperFunction.h +++ b/orc-rt/include/orc-rt/WrapperFunction.h @@ -168,7 +168,8 @@ struct ResultDeserializer<std::tuple<Expected<T>>, Serializer> { Serializer &S) { if (auto Val = S.result().template deserialize<std::tuple<T>>( std::move(ResultBytes))) - return std::move(std::get<0>(*Val)); + return Expected<T>(std::move(std::get<0>(*Val)), + ForceExpectedSuccessValue()); else return make_error<StringError>("Could not deserialize result"); } diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp index 0b65515..c0c86ff 100644 --- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp +++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp @@ -144,3 +144,77 @@ TEST(SPSWrapperFunctionUtilsTest, TestBinaryOpViaFunctionPointer) { [&](Expected<int32_t> R) { Result = cantFail(std::move(R)); }, 41, 1); EXPECT_EQ(Result, 42); } + +static void improbable_feat_sps_wrapper(orc_rt_SessionRef Session, + void *CallCtx, + orc_rt_WrapperFunctionReturn Return, + orc_rt_WrapperFunctionBuffer ArgBytes) { + SPSWrapperFunction<SPSError(bool)>::handle( + Session, CallCtx, Return, ArgBytes, + [](move_only_function<void(Error)> Return, bool LuckyHat) { + if (LuckyHat) + Return(Error::success()); + else + Return(make_error<StringError>("crushed by boulder")); + }); +} + +TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningErrorSuccessCase) { + bool DidRun = false; + SPSWrapperFunction<SPSError(bool)>::call( + DirectCaller(nullptr, improbable_feat_sps_wrapper), + [&](Expected<Error> E) { + DidRun = true; + cantFail(cantFail(std::move(E))); + }, + true); + + EXPECT_TRUE(DidRun); +} + +TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningErrorFailureCase) { + std::string ErrMsg; + SPSWrapperFunction<SPSError(bool)>::call( + DirectCaller(nullptr, improbable_feat_sps_wrapper), + [&](Expected<Error> E) { ErrMsg = toString(cantFail(std::move(E))); }, + false); + + EXPECT_EQ(ErrMsg, "crushed by boulder"); +} + +static void halve_number_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx, + orc_rt_WrapperFunctionReturn Return, + orc_rt_WrapperFunctionBuffer ArgBytes) { + SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::handle( + Session, CallCtx, Return, ArgBytes, + [](move_only_function<void(Expected<int32_t>)> Return, int N) { + if (N % 2 == 0) + Return(N >> 1); + else + Return(make_error<StringError>("N is not a multiple of 2")); + }); +} + +TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedSuccessCase) { + int32_t Result = 0; + SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::call( + DirectCaller(nullptr, halve_number_sps_wrapper), + 
[&](Expected<Expected<int32_t>> R) { + Result = cantFail(cantFail(std::move(R))); + }, + 2); + + EXPECT_EQ(Result, 1); +} + +TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedFailureCase) { + std::string ErrMsg; + SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::call( + DirectCaller(nullptr, halve_number_sps_wrapper), + [&](Expected<Expected<int32_t>> R) { + ErrMsg = toString(cantFail(std::move(R)).takeError()); + }, + 3); + + EXPECT_EQ(ErrMsg, "N is not a multiple of 2"); +} |
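For reference on the test updates in this patch: the rewritten CHECK lines revolve around the llvm.vector.partial.reduce.add intrinsic (e.g. @llvm.vector.partial.reduce.add.v4i32.v16i32), which folds a wide input vector into a narrower accumulator so that fully reducing the result gives the same total as reducing the accumulator and the input separately. The standalone C++ sketch below is not part of the patch; it only models that contract, and the round-robin lane grouping it uses is an assumption made for illustration, since the intrinsic leaves the exact element-to-lane mapping unspecified.

// Illustrative model of @llvm.vector.partial.reduce.add:
// accumulate an N-element input into an M-element accumulator (N % M == 0).
// Only the overall sum is guaranteed by the intrinsic; the per-lane grouping
// below (input lane j feeds accumulator lane j % M) is an assumption.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

static std::vector<int64_t> partialReduceAdd(std::vector<int64_t> Acc,
                                             const std::vector<int64_t> &In) {
  for (size_t J = 0; J < In.size(); ++J)
    Acc[J % Acc.size()] += In[J];
  return Acc;
}

int main() {
  // Mirrors the <2 x i64> accumulator / <16 x i64> input shape seen in the
  // CHECK-INTERLEAVE1 output above.
  std::vector<int64_t> Acc(2, 0);
  std::vector<int64_t> In(16);
  std::iota(In.begin(), In.end(), 1); // 1..16
  Acc = partialReduceAdd(Acc, In);
  // A final reduction over the accumulator matches @llvm.vector.reduce.add.
  int64_t Total = std::accumulate(Acc.begin(), Acc.end(), int64_t{0});
  std::cout << Total << "\n"; // prints 136 == 1 + 2 + ... + 16
  return 0;
}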