-rw-r--r--bolt/include/bolt/Core/BinaryContext.h6
-rw-r--r--bolt/include/bolt/Passes/SplitFunctions.h19
-rw-r--r--bolt/include/bolt/Rewrite/MetadataRewriters.h2
-rw-r--r--bolt/include/bolt/Utils/CommandLineOpts.h20
-rw-r--r--bolt/lib/Passes/LongJmp.cpp4
-rw-r--r--bolt/lib/Passes/SplitFunctions.cpp35
-rw-r--r--bolt/lib/Rewrite/CMakeLists.txt1
-rw-r--r--bolt/lib/Rewrite/GNUPropertyRewriter.cpp147
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp9
-rw-r--r--bolt/lib/Utils/CommandLineOpts.cpp23
-rw-r--r--bolt/test/AArch64/Inputs/property-note-bti.yaml50
-rw-r--r--bolt/test/AArch64/Inputs/property-note-nobti.yaml50
-rw-r--r--bolt/test/AArch64/bti-note.test10
-rw-r--r--bolt/test/AArch64/no-bti-note.test10
-rw-r--r--bolt/test/AArch64/unsupported-passes.test7
-rw-r--r--clang/docs/InternalsManual.rst61
-rw-r--r--clang/docs/ReleaseNotes.rst6
-rw-r--r--clang/include/clang/AST/ASTConcept.h33
-rw-r--r--clang/include/clang/AST/ASTContext.h1
-rw-r--r--clang/include/clang/Sema/Sema.h111
-rw-r--r--clang/include/clang/Sema/SemaConcept.h434
-rw-r--r--clang/include/clang/Sema/Template.h22
-rw-r--r--clang/lib/AST/ASTConcept.cpp31
-rw-r--r--clang/lib/AST/ASTImporter.cpp12
-rw-r--r--clang/lib/Sema/SemaConcept.cpp2000
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp16
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp16
-rw-r--r--clang/lib/Sema/SemaInit.cpp5
-rw-r--r--clang/lib/Sema/SemaOverload.cpp6
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp93
-rw-r--r--clang/lib/Sema/SemaTemplateDeduction.cpp51
-rw-r--r--clang/lib/Sema/SemaTemplateDeductionGuide.cpp39
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiate.cpp168
-rw-r--r--clang/lib/Sema/TreeTransform.h19
-rw-r--r--clang/lib/Serialization/ASTReaderDecl.cpp2
-rw-r--r--clang/lib/Serialization/ASTReaderStmt.cpp14
-rw-r--r--clang/lib/Serialization/ASTWriterStmt.cpp18
-rw-r--r--clang/test/AST/ast-dump-concepts.cpp10
-rw-r--r--clang/test/AST/ast-dump-ctad-alias.cpp21
-rw-r--r--clang/test/CXX/drs/cwg25xx.cpp14
-rw-r--r--clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp3
-rw-r--r--clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp14
-rw-r--r--clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp35
-rw-r--r--clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp4
-rw-r--r--clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp12
-rw-r--r--clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp5
-rw-r--r--clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp59
-rw-r--r--clang/test/CXX/temp/temp.param/p10-2a.cpp23
-rw-r--r--clang/test/SemaCXX/cxx20-ctad-type-alias.cpp17
-rw-r--r--clang/test/SemaCXX/cxx23-assume.cpp9
-rw-r--r--clang/test/SemaCXX/cxx2b-deducing-this.cpp8
-rw-r--r--clang/test/SemaCXX/cxx2c-fold-exprs.cpp202
-rw-r--r--clang/test/SemaCXX/cxx2c-template-template-param.cpp4
-rw-r--r--clang/test/SemaCXX/invalid-requirement-requires-expr.cpp4
-rw-r--r--clang/test/SemaCXX/overload-resolution-deferred-templates.cpp3
-rw-r--r--clang/test/SemaCXX/type-traits.cpp4
-rw-r--r--clang/test/SemaHLSL/BuiltIns/Buffers.hlsl6
-rw-r--r--clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl6
-rw-r--r--clang/test/SemaTemplate/GH161657.cpp2
-rw-r--r--clang/test/SemaTemplate/concepts-recovery-expr.cpp32
-rw-r--r--clang/test/SemaTemplate/concepts-recursive-inst.cpp27
-rw-r--r--clang/test/SemaTemplate/concepts.cpp71
-rw-r--r--clang/test/SemaTemplate/deduction-guide.cpp15
-rw-r--r--clang/test/SemaTemplate/instantiate-abbreviated-template.cpp1
-rw-r--r--clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp4
-rw-r--r--clang/test/SemaTemplate/instantiate-requires-expr.cpp20
-rw-r--r--clang/test/SemaTemplate/instantiate-template-argument.cpp97
-rw-r--r--clang/test/SemaTemplate/pr52970.cpp2
-rw-r--r--flang-rt/lib/runtime/character.cpp4
-rw-r--r--flang/lib/Optimizer/Transforms/AddDebugInfo.cpp9
-rw-r--r--flang/test/Transforms/debug-module-3.fir13
-rw-r--r--libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp4
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h8
-rw-r--r--llvm/include/llvm/Transforms/Scalar/GVN.h2
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLink.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp30
-rw-r--r--llvm/lib/Object/BuildID.cpp26
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp121
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td16
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h10
-rw-r--r--llvm/lib/Target/RISCV/RISCVGISel.td10
-rw-r--r--llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp4
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp33
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h2
-rw-r--r--llvm/runtimes/CMakeLists.txt8
-rw-r--r--llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/stack-hazard.ll272
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir48
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir9
-rw-r--r--llvm/test/CodeGen/ARM/and-mask-variable.ll90
-rw-r--r--llvm/test/CodeGen/ARM/extract-bits.ll4591
-rw-r--r--llvm/test/CodeGen/ARM/extract-lowbits.ll2752
-rw-r--r--llvm/test/CodeGen/X86/isel-fpclass.ll256
-rw-r--r--llvm/test/CodeGen/X86/isel-smax.ll244
-rw-r--r--llvm/test/CodeGen/X86/isel-smin.ll244
-rw-r--r--llvm/test/CodeGen/X86/isel-umax.ll244
-rw-r--r--llvm/test/CodeGen/X86/isel-umin.ll244
-rw-r--r--llvm/test/CodeGen/X86/pr161693.ll40
-rw-r--r--llvm/test/DebugInfo/symbolize-build-id.test1
-rw-r--r--llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s2
-rw-r--r--llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s7
-rw-r--r--llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s7
-rw-r--r--llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s32
-rw-r--r--llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s2
-rw-r--r--llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s2
-rw-r--r--llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll34
-rw-r--r--llvm/test/Transforms/GVN/masked-load-store.ll178
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll162
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll100
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll638
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll94
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll94
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll2
-rw-r--r--llvm/tools/llvm-jitlink/llvm-jitlink.cpp6
-rw-r--r--llvm/unittests/Object/BuildIDTest.cpp120
-rw-r--r--llvm/unittests/Object/CMakeLists.txt1
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp2
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp4
-rw-r--r--llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn1
-rw-r--r--mlir/lib/CAPI/Transforms/Rewrite.cpp13
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp21
-rw-r--r--mlir/tools/mlir-rewrite/mlir-rewrite.cpp12
-rw-r--r--mlir/unittests/TableGen/PassGenTest.cpp3
-rw-r--r--orc-rt/include/orc-rt/SPSWrapperFunction.h59
-rw-r--r--orc-rt/include/orc-rt/WrapperFunction.h3
-rw-r--r--orc-rt/unittests/SPSWrapperFunctionTest.cpp74
141 files changed, 12735 insertions, 2678 deletions
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 082f1ce..8960b19 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -190,6 +190,9 @@ class BinaryContext {
/// Unique build ID if available for the binary.
std::optional<std::string> FileBuildID;
+ /// Whether the binary uses AArch64 BTI, as indicated by its GNU property note.
+ bool UsesBTI{false};
+
/// Set of all sections.
struct CompareSections {
bool operator()(const BinarySection *A, const BinarySection *B) const {
@@ -384,6 +387,9 @@ public:
}
void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
+ bool usesBTI() const { return UsesBTI; }
+ void setUsesBTI(bool Value) { UsesBTI = Value; }
+
bool hasSymbolsWithFileName() const { return HasSymbolsWithFileName; }
void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = Value; }
diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
index 8bdc48b..2c1bf18 100644
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -18,25 +18,6 @@
namespace llvm {
namespace bolt {
-/// Strategy used to partition blocks into fragments.
-enum SplitFunctionsStrategy : char {
- /// Split each function into a hot and cold fragment using profiling
- /// information.
- Profile2 = 0,
- /// Split each function into a hot, warm, and cold fragment using
- /// profiling information.
- CDSplit,
- /// Split each function into a hot and cold fragment at a randomly chosen
- /// split point (ignoring any available profiling information).
- Random2,
- /// Split each function into N fragments at a randomly chosen split points
- /// (ignoring any available profiling information).
- RandomN,
- /// Split all basic blocks of each function into fragments such that each
- /// fragment contains exactly a single basic block.
- All
-};
-
class SplitStrategy {
public:
using BlockIt = BinaryFunction::BasicBlockOrderType::iterator;
diff --git a/bolt/include/bolt/Rewrite/MetadataRewriters.h b/bolt/include/bolt/Rewrite/MetadataRewriters.h
index b71bd6c..2c09c879 100644
--- a/bolt/include/bolt/Rewrite/MetadataRewriters.h
+++ b/bolt/include/bolt/Rewrite/MetadataRewriters.h
@@ -27,6 +27,8 @@ std::unique_ptr<MetadataRewriter> createPseudoProbeRewriter(BinaryContext &);
std::unique_ptr<MetadataRewriter> createSDTRewriter(BinaryContext &);
+std::unique_ptr<MetadataRewriter> createGNUPropertyRewriter(BinaryContext &);
+
} // namespace bolt
} // namespace llvm
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index 859d6f3..0964c2c 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -29,6 +29,25 @@ enum HeatmapModeKind {
HM_Optional // perf2bolt --heatmap
};
+/// Strategy used to partition blocks into fragments.
+enum SplitFunctionsStrategy : char {
+ /// Split each function into a hot and cold fragment using profiling
+ /// information.
+ Profile2 = 0,
+ /// Split each function into a hot, warm, and cold fragment using
+ /// profiling information.
+ CDSplit,
+ /// Split each function into a hot and cold fragment at a randomly chosen
+ /// split point (ignoring any available profiling information).
+ Random2,
+ /// Split each function into N fragments at a randomly chosen split points
+ /// (ignoring any available profiling information).
+ RandomN,
+ /// Split all basic blocks of each function into fragments such that each
+ /// fragment contains exactly a single basic block.
+ All
+};
+
using HeatmapBlockSizes = std::vector<unsigned>;
struct HeatmapBlockSpecParser : public llvm::cl::parser<HeatmapBlockSizes> {
explicit HeatmapBlockSpecParser(llvm::cl::Option &O)
@@ -78,6 +97,7 @@ extern llvm::cl::opt<std::string> OutputFilename;
extern llvm::cl::opt<std::string> PerfData;
extern llvm::cl::opt<bool> PrintCacheMetrics;
extern llvm::cl::opt<bool> PrintSections;
+extern llvm::cl::opt<SplitFunctionsStrategy> SplitStrategy;
// The format to use with -o in aggregation mode (perf2bolt)
enum ProfileFormatKind { PF_Fdata, PF_YAML };
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 4dade16..03c1ea9 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -895,6 +895,10 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+ assert((opts::CompactCodeModel ||
+ opts::SplitStrategy != opts::SplitFunctionsStrategy::CDSplit) &&
+ "LongJmp cannot work with functions split in more than two fragments");
+
if (opts::CompactCodeModel) {
BC.outs()
<< "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index b21401e..eab669b 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -86,29 +86,6 @@ static cl::opt<unsigned> SplitThreshold(
"increase after splitting."),
cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
-static cl::opt<SplitFunctionsStrategy> SplitStrategy(
- "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
- cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
- "split each function into a hot and cold fragment "
- "using profiling information")),
- cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
- "split each function into a hot, warm, and cold "
- "fragment using profiling information")),
- cl::values(clEnumValN(
- SplitFunctionsStrategy::Random2, "random2",
- "split each function into a hot and cold fragment at a randomly chosen "
- "split point (ignoring any available profiling information)")),
- cl::values(clEnumValN(
- SplitFunctionsStrategy::RandomN, "randomN",
- "split each function into N fragments at a randomly chosen split "
- "points (ignoring any available profiling information)")),
- cl::values(clEnumValN(
- SplitFunctionsStrategy::All, "all",
- "split all basic blocks of each function into fragments such that each "
- "fragment contains exactly a single basic block")),
- cl::desc("strategy used to partition blocks into fragments"),
- cl::cat(BoltOptCategory));
-
static cl::opt<double> CallScale(
"call-scale",
cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
@@ -724,14 +701,14 @@ Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
// If split strategy is not CDSplit, then a second run of the pass is not
// needed after function reordering.
if (BC.HasFinalizedFunctionOrder &&
- opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
+ opts::SplitStrategy != opts::SplitFunctionsStrategy::CDSplit)
return Error::success();
std::unique_ptr<SplitStrategy> Strategy;
bool ForceSequential = false;
switch (opts::SplitStrategy) {
- case SplitFunctionsStrategy::CDSplit:
+ case opts::SplitFunctionsStrategy::CDSplit:
// CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
// before function reordering and hot-warm-cold splitting
// (SplitCacheDirected) after function reordering.
@@ -742,21 +719,21 @@ Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
opts::AggressiveSplitting = true;
BC.HasWarmSection = true;
break;
- case SplitFunctionsStrategy::Profile2:
+ case opts::SplitFunctionsStrategy::Profile2:
Strategy = std::make_unique<SplitProfile2>();
break;
- case SplitFunctionsStrategy::Random2:
+ case opts::SplitFunctionsStrategy::Random2:
Strategy = std::make_unique<SplitRandom2>();
// If we split functions randomly, we need to ensure that across runs with
// the same input, we generate random numbers for each function in the same
// order.
ForceSequential = true;
break;
- case SplitFunctionsStrategy::RandomN:
+ case opts::SplitFunctionsStrategy::RandomN:
Strategy = std::make_unique<SplitRandomN>();
ForceSequential = true;
break;
- case SplitFunctionsStrategy::All:
+ case opts::SplitFunctionsStrategy::All:
Strategy = std::make_unique<SplitAll>();
break;
}
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 7750360..5b15edc 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMBOLTRewrite
PseudoProbeRewriter.cpp
RewriteInstance.cpp
SDTRewriter.cpp
+ GNUPropertyRewriter.cpp
NO_EXPORT
DISABLE_LLVM_LINK_LLVM_DYLIB
diff --git a/bolt/lib/Rewrite/GNUPropertyRewriter.cpp b/bolt/lib/Rewrite/GNUPropertyRewriter.cpp
new file mode 100644
index 0000000..f61c08e
--- /dev/null
+++ b/bolt/lib/Rewrite/GNUPropertyRewriter.cpp
@@ -0,0 +1,147 @@
+//===- bolt/Rewrite/GNUPropertyRewriter.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Read the .note.gnu.property section.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Rewrite/MetadataRewriter.h"
+#include "bolt/Rewrite/MetadataRewriters.h"
+#include "llvm/Support/Errc.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace {
+
+class GNUPropertyRewriter final : public MetadataRewriter {
+
+ Expected<uint32_t> decodeGNUPropertyNote(StringRef Desc);
+
+public:
+ GNUPropertyRewriter(StringRef Name, BinaryContext &BC)
+ : MetadataRewriter(Name, BC) {}
+
+ Error sectionInitializer() override;
+};
+
+Error GNUPropertyRewriter::sectionInitializer() {
+
+ ErrorOr<BinarySection &> Sec =
+ BC.getUniqueSectionByName(".note.gnu.property");
+ if (!Sec)
+ return Error::success();
+
+ // Accumulate feature bits
+ uint32_t FeaturesAcc = 0;
+
+ StringRef Buf = Sec->getContents();
+ DataExtractor DE(Buf, BC.AsmInfo->isLittleEndian(),
+ BC.AsmInfo->getCodePointerSize());
+ DataExtractor::Cursor Cursor(0);
+ while (Cursor && !DE.eof(Cursor)) {
+ const uint32_t NameSz = DE.getU32(Cursor);
+ const uint32_t DescSz = DE.getU32(Cursor);
+ const uint32_t Type = DE.getU32(Cursor);
+
+ StringRef Name =
+ NameSz ? Buf.slice(Cursor.tell(), Cursor.tell() + NameSz) : "<empty>";
+ Cursor.seek(alignTo(Cursor.tell() + NameSz, 4));
+
+ const uint64_t DescOffset = Cursor.tell();
+ StringRef Desc =
+ DescSz ? Buf.slice(DescOffset, DescOffset + DescSz) : "<empty>";
+ Cursor.seek(alignTo(DescOffset + DescSz, 4));
+ if (!Cursor)
+ return createStringError(
+ errc::executable_format_error,
+ "out of bounds while reading .note.gnu.property section: %s",
+ toString(Cursor.takeError()).c_str());
+
+ if (Type == ELF::NT_GNU_PROPERTY_TYPE_0 && Name.starts_with("GNU") &&
+ DescSz) {
+ auto Features = decodeGNUPropertyNote(Desc);
+ if (!Features)
+ return Features.takeError();
+ FeaturesAcc |= *Features;
+ }
+ }
+
+ if (BC.isAArch64()) {
+ BC.setUsesBTI(FeaturesAcc & llvm::ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
+ if (BC.usesBTI())
+ BC.outs() << "BOLT-WARNING: binary is using BTI. Optimized binary may be "
+ "corrupted\n";
+ }
+
+ return Error::success();
+}
+
+/// \p Desc contains an array of property descriptors. Each member has the
+/// following structure:
+/// typedef struct {
+/// Elf_Word pr_type;
+/// Elf_Word pr_datasz;
+/// unsigned char pr_data[PR_DATASZ];
+/// unsigned char pr_padding[PR_PADDING];
+/// } Elf_Prop;
+///
+/// As there is no guarantee as to which element of the array encodes the
+/// features, we have to read all of them and OR the results together.
+Expected<uint32_t> GNUPropertyRewriter::decodeGNUPropertyNote(StringRef Desc) {
+ DataExtractor DE(Desc, BC.AsmInfo->isLittleEndian(),
+ BC.AsmInfo->getCodePointerSize());
+ DataExtractor::Cursor Cursor(0);
+ const uint32_t Align = DE.getAddressSize();
+
+ std::optional<uint32_t> Features = 0;
+ while (Cursor && !DE.eof(Cursor)) {
+ const uint32_t PrType = DE.getU32(Cursor);
+ const uint32_t PrDataSz = DE.getU32(Cursor);
+
+ const uint64_t PrDataStart = Cursor.tell();
+ const uint64_t PrDataEnd = PrDataStart + PrDataSz;
+ Cursor.seek(PrDataEnd);
+ if (!Cursor)
+ return createStringError(
+ errc::executable_format_error,
+ "out of bounds while reading .note.gnu.property section: %s",
+ toString(Cursor.takeError()).c_str());
+
+ if (PrType == llvm::ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND) {
+ if (PrDataSz != 4) {
+ return createStringError(
+ errc::executable_format_error,
+ "Property descriptor size has to be 4 bytes on AArch64\n");
+ }
+ DataExtractor::Cursor Tmp(PrDataStart);
+ // PrDataSz = 4 -> PrData is uint32_t
+ const uint32_t FeaturesItem = DE.getU32(Tmp);
+ if (!Tmp)
+ return createStringError(
+ errc::executable_format_error,
+ "failed to read property from .note.gnu.property section: %s",
+ toString(Tmp.takeError()).c_str());
+ Features = Features ? (*Features | FeaturesItem) : FeaturesItem;
+ }
+
+ Cursor.seek(alignTo(PrDataEnd, Align));
+ if (!Cursor)
+ return createStringError(errc::executable_format_error,
+ "out of bounds while reading property array in "
+ ".note.gnu.property section: %s",
+ toString(Cursor.takeError()).c_str());
+ }
+ return Features.value_or(0u);
+}
+} // namespace
+
+std::unique_ptr<MetadataRewriter>
+llvm::bolt::createGNUPropertyRewriter(BinaryContext &BC) {
+ return std::make_unique<GNUPropertyRewriter>("gnu-property-rewriter", BC);
+}
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index c13a9f0..bfd03e0 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -2115,6 +2115,13 @@ void RewriteInstance::adjustCommandLineOptions() {
opts::SplitEH = false;
}
+ if (BC->isAArch64() && !opts::CompactCodeModel &&
+ opts::SplitStrategy == opts::SplitFunctionsStrategy::CDSplit) {
+ BC->errs() << "BOLT-ERROR: CDSplit is not supported with LongJmp. Try with "
+ "'--compact-code-model'\n";
+ exit(1);
+ }
+
if (opts::StrictMode && !BC->HasRelocations) {
BC->errs()
<< "BOLT-WARNING: disabling strict mode (-strict) in non-relocation "
@@ -3331,6 +3338,8 @@ void RewriteInstance::initializeMetadataManager() {
MetadataManager.registerRewriter(createPseudoProbeRewriter(*BC));
MetadataManager.registerRewriter(createSDTRewriter(*BC));
+
+ MetadataManager.registerRewriter(createGNUPropertyRewriter(*BC));
}
void RewriteInstance::processSectionMetadata() {
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index 5635da4..095612a 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -104,6 +104,29 @@ ExecutionCountThreshold("execution-count-threshold",
cl::Hidden,
cl::cat(BoltOptCategory));
+cl::opt<SplitFunctionsStrategy> SplitStrategy(
+ "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
+ cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
+ "split each function into a hot and cold fragment "
+ "using profiling information")),
+ cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
+ "split each function into a hot, warm, and cold "
+ "fragment using profiling information")),
+ cl::values(clEnumValN(
+ SplitFunctionsStrategy::Random2, "random2",
+ "split each function into a hot and cold fragment at a randomly chosen "
+ "split point (ignoring any available profiling information)")),
+ cl::values(clEnumValN(
+ SplitFunctionsStrategy::RandomN, "randomN",
+ "split each function into N fragments at a randomly chosen split "
+ "points (ignoring any available profiling information)")),
+ cl::values(clEnumValN(
+ SplitFunctionsStrategy::All, "all",
+ "split all basic blocks of each function into fragments such that each "
+ "fragment contains exactly a single basic block")),
+ cl::desc("strategy used to partition blocks into fragments"),
+ cl::cat(BoltOptCategory));
+
bool HeatmapBlockSpecParser::parse(cl::Option &O, StringRef ArgName,
StringRef Arg, HeatmapBlockSizes &Val) {
// Parses a human-readable suffix into a shift amount or nullopt on error.
diff --git a/bolt/test/AArch64/Inputs/property-note-bti.yaml b/bolt/test/AArch64/Inputs/property-note-bti.yaml
new file mode 100644
index 0000000..541ae92
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/property-note-bti.yaml
@@ -0,0 +1,50 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_AARCH64
+ Entry: 0x400510
+ProgramHeaders:
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x400338
+ Align: 0x8
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ VAddr: 0x0
+ Align: 0x10000
+ FileSize: 0xf8
+ MemSize: 0xf8
+ Offset: 0x0
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x2a0000
+ AddressAlign: 0x4
+ Content: 400580d2c0035fd6
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400338
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
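+ # The descriptor encodes one Elf_Prop (little endian):
+ # pr_type=0xc0000000 (GNU_PROPERTY_AARCH64_FEATURE_1_AND),
+ # pr_datasz=4, pr_data=0x3 (BTI | PAC), then 4 bytes of padding.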
+ Desc: 000000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Type: SectionHeaderTable
+ Sections:
+ - Name: .note.gnu.property
+ - Name: .symtab
+ - Name: .strtab
+ - Name: .shstrtab
+ - Name: .text
+Symbols:
+ - Name: .note.gnu.property
+ Type: STT_SECTION
+ Section: .note.gnu.property
+ Value: 0x400338
+...
diff --git a/bolt/test/AArch64/Inputs/property-note-nobti.yaml b/bolt/test/AArch64/Inputs/property-note-nobti.yaml
new file mode 100644
index 0000000..a041a58
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/property-note-nobti.yaml
@@ -0,0 +1,50 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_AARCH64
+ Entry: 0x400510
+ProgramHeaders:
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x400338
+ Align: 0x8
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ VAddr: 0x0
+ Align: 0x10000
+ FileSize: 0xf8
+ MemSize: 0xf8
+ Offset: 0x0
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x2a0000
+ AddressAlign: 0x4
+ Content: 400580d2c0035fd6
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400338
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
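+ # Same Elf_Prop layout as the BTI input, but pr_data=0x2 (PAC only):
+ # the BTI bit (0x1) is clear.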
+ Desc: 000000C0040000000200000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Type: SectionHeaderTable
+ Sections:
+ - Name: .note.gnu.property
+ - Name: .symtab
+ - Name: .strtab
+ - Name: .shstrtab
+ - Name: .text
+Symbols:
+ - Name: .note.gnu.property
+ Type: STT_SECTION
+ Section: .note.gnu.property
+ Value: 0x400338
+...
diff --git a/bolt/test/AArch64/bti-note.test b/bolt/test/AArch64/bti-note.test
new file mode 100644
index 0000000..1ec9d77
--- /dev/null
+++ b/bolt/test/AArch64/bti-note.test
@@ -0,0 +1,10 @@
+// This test checks that the GNUPropertyRewriter can decode the BTI feature flag.
+// It decodes an executable with BTI, and checks for the warning.
+
+RUN: yaml2obj %p/Inputs/property-note-bti.yaml -o %t.exe
+
+RUN: llvm-readelf -n %t.exe | FileCheck %s
+CHECK: BTI
+
+RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s -check-prefix=CHECK-BOLT
+CHECK-BOLT: BOLT-WARNING: binary is using BTI. Optimized binary may be corrupted
diff --git a/bolt/test/AArch64/no-bti-note.test b/bolt/test/AArch64/no-bti-note.test
new file mode 100644
index 0000000..28cce34
--- /dev/null
+++ b/bolt/test/AArch64/no-bti-note.test
@@ -0,0 +1,10 @@
+// This test checks that the GNUPropertyRewriter can decode the BTI feature flag.
+// It decodes an executable without BTI, and checks that no warning is emitted.
+
+RUN: yaml2obj %p/Inputs/property-note-nobti.yaml -o %t.exe
+
+RUN: llvm-readelf -n %t.exe | FileCheck %s
+CHECK-NOT: BTI
+
+RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s -check-prefix=CHECK-BOLT
+CHECK-BOLT-NOT: BOLT-WARNING: binary is using BTI. Optimized binary may be corrupted
diff --git a/bolt/test/AArch64/unsupported-passes.test b/bolt/test/AArch64/unsupported-passes.test
index 886fc1c..5b12d86 100644
--- a/bolt/test/AArch64/unsupported-passes.test
+++ b/bolt/test/AArch64/unsupported-passes.test
@@ -3,6 +3,9 @@
// REQUIRES: system-linux,asserts,target=aarch64{{.*}}
RUN: %clang %cflags %p/../Inputs/hello.c -o %t -Wl,-q
-RUN: not llvm-bolt %t -o %t.bolt --frame-opt=all 2>&1 | FileCheck %s
+RUN: not llvm-bolt %t -o %t.bolt --frame-opt=all 2>&1 | FileCheck %s --check-prefix=CHECK-FRAME-OPT
-CHECK: BOLT-ERROR: frame-optimizer is supported only on X86
+CHECK-FRAME-OPT: BOLT-ERROR: frame-optimizer is supported only on X86
+
+RUN: not llvm-bolt %t -o %t.bolt --split-functions --split-strategy=cdsplit 2>&1 | FileCheck %s --check-prefix=CHECK-CDSPLIT
+CHECK-CDSPLIT: BOLT-ERROR: CDSplit is not supported with LongJmp. Try with '--compact-code-model'
diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index bd74227..c677ddfa 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -2859,6 +2859,67 @@ This library is called by the :ref:`Parser library <Parser>` during parsing to
do semantic analysis of the input. For valid programs, Sema builds an AST for
parsed constructs.
+
+Concept Satisfaction Checking and Subsumption
+---------------------------------------------
+
+As per the C++ standard, constraints are `normalized <https://eel.is/c++draft/temp.constr.normal>`_
+and the normal form is used both for subsumption and for constraint checking.
+Both depend on a parameter mapping that is substituted lazily. In particular,
+we should not substitute into unused arguments.
+
+Clang follows the order of operations prescribed by the standard.
+
+Normalization happens prior to satisfaction and subsumption
+and is handled by ``NormalizedConstraint``.
+
+Clang preserves intermediate concept-ids (``ConceptIdConstraint``) in the
+normalized form. This is used only for diagnostics, and no substitution
+happens in a ``ConceptIdConstraint`` if its expression is satisfied.
+
+The normal form of the associated constraints of a declaration is cached in
+``Sema::NormalizationCache`` such that it is only computed once.
+
+A ``NormalizedConstraint`` is a recursive data structure, where each node
+contains a parameter mapping, represented by the indexes of all parameters
+being used.
+
+Checking satisfaction is done by ``ConstraintSatisfactionChecker``, recursively
+walking ``NormalizedConstraint``. At each level, we substitute the outermost
+level of the template arguments referenced in the parameter mapping of a
+normalized expression (``MultiLevelTemplateArgumentList``).
+
+For the following example,
+
+.. code-block:: c++
+
+ template <typename T>
+ concept A = __is_same(T, int);
+
+ template <typename U>
+ concept B = A<U> && __is_same(U, int);
+
+The normal form of ``B`` is
+
+.. code-block:: c++
+
+ __is_same(T, int) /*T->U, innermost level*/
+ && __is_same(U, int) /*U->U, outermost level*/
+
+After substituting into the mapping, we substitute into the constraint
+expression using that copy of the ``MultiLevelTemplateArgumentList``, and
+then evaluate it.
+
+Because this is expensive, the result is cached in
+``UnsubstitutedConstraintSatisfactionCache``.
+
+Any error during satisfaction is recorded in ``ConstraintSatisfaction``.
+For nested requirements, ``ConstraintSatisfaction`` is stored (including
+diagnostics) in the AST, which is something we might want to improve.
+
+When an atomic constraint is not satisfied, we try to substitute into any
+enclosing concept-id using the same mechanism described above, for
+diagnostic purposes, and inject the result into the ``ConstraintSatisfaction``.
+
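+For instance, in the following minimal sketch, the invalid ``int::type`` is
+only used as an argument for the unused parameter ``U``, so it is never
+substituted, and ``B<int>`` is satisfied:
+
+.. code-block:: c++
+
+ template <typename T, typename U>
+ concept A = __is_same(T, int); // U is never named, so its mapping entry
+                                // is never substituted.
+
+ template <typename V>
+ concept B = A<V, typename V::type>;
+
+ static_assert(B<int>); // satisfied; the invalid int::type is never
+                        // substituted, so no error is produced.
+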
.. _CodeGen:
The CodeGen Library
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 145a83a..d2e5bd2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -160,6 +160,10 @@ C++23 Feature Support
C++20 Feature Support
^^^^^^^^^^^^^^^^^^^^^
+- Clang now normalizes constraints before checking whether they are satisfied, as mandated by the standard.
+ As a result, Clang no longer incorrectly diagnoses substitution failures in template arguments only
+ used in concept-ids, and produces better diagnostics for satisfaction failures. (#GH61811) (#GH135190)
+
C++17 Feature Support
^^^^^^^^^^^^^^^^^^^^^
@@ -361,7 +365,7 @@ Bug Fixes in This Version
first parameter. (#GH113323).
- Fixed a crash with incompatible pointer to integer conversions in designated
initializers involving string literals. (#GH154046)
-- Fix crash on CTAD for alias template. (#GH131342)
+- Fix crash on CTAD for alias template. (#GH131342), (#GH131408)
- Clang now emits a frontend error when a function marked with the `flatten` attribute
calls another function that requires target features not enabled in the caller. This
prevents a fatal error in the backend.
diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h
index 72da005..f362f24 100644
--- a/clang/include/clang/AST/ASTConcept.h
+++ b/clang/include/clang/AST/ASTConcept.h
@@ -28,10 +28,20 @@ namespace clang {
class ConceptDecl;
class TemplateDecl;
+class ConceptReference;
class Expr;
class NamedDecl;
struct PrintingPolicy;
+/// An unsatisfied constraint expression or concept-id, if the template
+/// arguments could be substituted into it, or a diagnostic if substitution
+/// resulted in an invalid expression.
+///
+using ConstraintSubstitutionDiagnostic = std::pair<SourceLocation, StringRef>;
+using UnsatisfiedConstraintRecord =
+ llvm::PointerUnion<const Expr *, const ConceptReference *,
+ const ConstraintSubstitutionDiagnostic *>;
+
/// The result of a constraint satisfaction check, containing the necessary
/// information to diagnose an unsatisfied constraint.
class ConstraintSatisfaction : public llvm::FoldingSetNode {
@@ -48,16 +58,13 @@ public:
ArrayRef<TemplateArgument> TemplateArgs)
: ConstraintOwner(ConstraintOwner), TemplateArgs(TemplateArgs) {}
- using SubstitutionDiagnostic = std::pair<SourceLocation, StringRef>;
- using Detail = llvm::PointerUnion<Expr *, SubstitutionDiagnostic *>;
-
bool IsSatisfied = false;
bool ContainsErrors = false;
/// \brief The substituted constraint expr, if the template arguments could be
/// substituted into them, or a diagnostic if substitution resulted in an
/// invalid expression.
- llvm::SmallVector<Detail, 4> Details;
+ llvm::SmallVector<UnsatisfiedConstraintRecord, 4> Details;
void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &C) {
Profile(ID, C, ConstraintOwner, TemplateArgs);
@@ -69,19 +76,12 @@ public:
bool HasSubstitutionFailure() {
for (const auto &Detail : Details)
- if (Detail.dyn_cast<SubstitutionDiagnostic *>())
+ if (Detail.dyn_cast<const ConstraintSubstitutionDiagnostic *>())
return true;
return false;
}
};
-/// Pairs of unsatisfied atomic constraint expressions along with the
-/// substituted constraint expr, if the template arguments could be
-/// substituted into them, or a diagnostic if substitution resulted in
-/// an invalid expression.
-using UnsatisfiedConstraintRecord =
- llvm::PointerUnion<Expr *, std::pair<SourceLocation, StringRef> *>;
-
/// \brief The result of a constraint satisfaction check, containing the
/// necessary information to diagnose an unsatisfied constraint.
///
@@ -101,6 +101,10 @@ struct ASTConstraintSatisfaction final :
return getTrailingObjects() + NumRecords;
}
+ ArrayRef<UnsatisfiedConstraintRecord> records() const {
+ return {begin(), end()};
+ }
+
ASTConstraintSatisfaction(const ASTContext &C,
const ConstraintSatisfaction &Satisfaction);
ASTConstraintSatisfaction(const ASTContext &C,
@@ -282,6 +286,11 @@ public:
}
};
+/// Insertion operator for diagnostics. This allows sending a ConceptReference
+/// into a diagnostic with <<.
+const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB,
+ const ConceptReference *C);
+
} // clang
#endif // LLVM_CLANG_AST_ASTCONCEPT_H
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 12351e9..78220d4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -3877,7 +3877,6 @@ typename clang::LazyGenerationalUpdatePtr<Owner, T, Update>::ValueType
return new (Ctx) LazyData(Source, Value);
return Value;
}
-
template <> struct llvm::DenseMapInfo<llvm::FoldingSetNodeID> {
static FoldingSetNodeID getEmptyKey() { return FoldingSetNodeID{}; }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index f53aafd..265462a 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -65,6 +65,7 @@
#include "clang/Sema/Redeclaration.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/SemaBase.h"
+#include "clang/Sema/SemaConcept.h"
#include "clang/Sema/TypoCorrection.h"
#include "clang/Sema/Weak.h"
#include "llvm/ADT/APInt.h"
@@ -11694,8 +11695,9 @@ public:
ExprResult
CheckConceptTemplateId(const CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
const DeclarationNameInfo &ConceptNameInfo,
- NamedDecl *FoundDecl, ConceptDecl *NamedConcept,
- const TemplateArgumentListInfo *TemplateArgs);
+ NamedDecl *FoundDecl, TemplateDecl *NamedConcept,
+ const TemplateArgumentListInfo *TemplateArgs,
+ bool DoCheckConstraintSatisfaction = true);
void diagnoseMissingTemplateArguments(TemplateName Name, SourceLocation Loc);
void diagnoseMissingTemplateArguments(const CXXScopeSpec &SS,
@@ -12025,6 +12027,13 @@ public:
bool UpdateArgsWithConversions = true,
bool *ConstraintsNotSatisfied = nullptr);
+ bool CheckTemplateArgumentList(
+ TemplateDecl *Template, TemplateParameterList *Params,
+ SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs,
+ const DefaultArguments &DefaultArgs, bool PartialTemplateArgs,
+ CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions = true,
+ bool *ConstraintsNotSatisfied = nullptr);
+
bool CheckTemplateTypeArgument(
TemplateTypeParmDecl *Param, TemplateArgumentLoc &Arg,
SmallVectorImpl<TemplateArgument> &SugaredConverted,
@@ -12783,6 +12792,18 @@ public:
void MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced,
unsigned Depth, llvm::SmallBitVector &Used);
+ /// Mark which template parameters are named in a given expression.
+ ///
+ /// Unlike MarkUsedTemplateParameters, this excludes parameters that
+ /// are used but not directly named by an expression - i.e. it excludes
+ /// any template parameter that denotes the type of a referenced NTTP.
+ ///
+ /// \param Used a bit vector whose elements will be set to \c true
+ /// to indicate when the corresponding template parameter will be
+ /// deduced.
+ void MarkUsedTemplateParametersForSubsumptionParameterMapping(
+ const Expr *E, unsigned Depth, llvm::SmallBitVector &Used);
+
/// Mark which template parameters can be deduced from a given
/// template argument list.
///
@@ -12799,6 +12820,9 @@ public:
void MarkUsedTemplateParameters(ArrayRef<TemplateArgument> TemplateArgs,
unsigned Depth, llvm::SmallBitVector &Used);
+ void MarkUsedTemplateParameters(ArrayRef<TemplateArgumentLoc> TemplateArgs,
+ unsigned Depth, llvm::SmallBitVector &Used);
+
void
MarkDeducedTemplateParameters(const FunctionTemplateDecl *FunctionTemplate,
llvm::SmallBitVector &Deduced) {
@@ -13096,6 +13120,9 @@ public:
/// Whether we're substituting into constraints.
bool InConstraintSubstitution;
+ /// Whether we're substituting into the parameter mapping of a constraint.
+ bool InParameterMappingSubstitution;
+
/// The point of instantiation or synthesis within the source code.
SourceLocation PointOfInstantiation;
@@ -13146,8 +13173,10 @@ public:
CodeSynthesisContext()
: Kind(TemplateInstantiation),
SavedInNonInstantiationSFINAEContext(false),
- InConstraintSubstitution(false), Entity(nullptr), Template(nullptr),
- TemplateArgs(nullptr), NumTemplateArgs(0), DeductionInfo(nullptr) {}
+ InConstraintSubstitution(false),
+ InParameterMappingSubstitution(false), Entity(nullptr),
+ Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0),
+ DeductionInfo(nullptr) {}
/// Determines whether this template is an actual instantiation
/// that should be counted toward the maximum instantiation depth.
@@ -13359,6 +13388,11 @@ public:
const MultiLevelTemplateArgumentList &TemplateArgs,
TemplateArgumentListInfo &Outputs);
+ bool SubstTemplateArgumentsInParameterMapping(
+ ArrayRef<TemplateArgumentLoc> Args, SourceLocation BaseLoc,
+ const MultiLevelTemplateArgumentList &TemplateArgs,
+ TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes);
+
/// Retrieve the template argument list(s) that should be used to
/// instantiate the definition of the given declaration.
///
@@ -13820,6 +13854,12 @@ public:
CodeSynthesisContexts.back().InConstraintSubstitution;
}
+ bool inParameterMappingSubstitution() const {
+ return !CodeSynthesisContexts.empty() &&
+ CodeSynthesisContexts.back().InParameterMappingSubstitution &&
+ !inConstraintSubstitution();
+ }
+
using EntityPrinter = llvm::function_ref<void(llvm::raw_ostream &)>;
/// \brief create a Requirement::SubstitutionDiagnostic with only a
@@ -14704,6 +14744,10 @@ public:
SatisfactionStack.swap(NewSS);
}
+ using ConstrainedDeclOrNestedRequirement =
+ llvm::PointerUnion<const NamedDecl *,
+ const concepts::NestedRequirement *>;
+
/// Check whether the given expression is a valid constraint expression.
/// A diagnostic is emitted if it is not, false is returned, and
/// PossibleNonPrimary will be set to true if the failure might be due to a
@@ -14728,44 +14772,12 @@ public:
/// \returns true if an error occurred and satisfaction could not be checked,
/// false otherwise.
bool CheckConstraintSatisfaction(
- const NamedDecl *Template,
+ ConstrainedDeclOrNestedRequirement Entity,
ArrayRef<AssociatedConstraint> AssociatedConstraints,
const MultiLevelTemplateArgumentList &TemplateArgLists,
- SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) {
- llvm::SmallVector<Expr *, 4> Converted;
- return CheckConstraintSatisfaction(Template, AssociatedConstraints,
- Converted, TemplateArgLists,
- TemplateIDRange, Satisfaction);
- }
-
- /// \brief Check whether the given list of constraint expressions are
- /// satisfied (as if in a 'conjunction') given template arguments.
- /// Additionally, takes an empty list of Expressions which is populated with
- /// the instantiated versions of the ConstraintExprs.
- /// \param Template the template-like entity that triggered the constraints
- /// check (either a concept or a constrained entity).
- /// \param ConstraintExprs a list of constraint expressions, treated as if
- /// they were 'AND'ed together.
- /// \param ConvertedConstraints a out parameter that will get populated with
- /// the instantiated version of the ConstraintExprs if we successfully checked
- /// satisfaction.
- /// \param TemplateArgList the multi-level list of template arguments to
- /// substitute into the constraint expression. This should be relative to the
- /// top-level (hence multi-level), since we need to instantiate fully at the
- /// time of checking.
- /// \param TemplateIDRange The source range of the template id that
- /// caused the constraints check.
- /// \param Satisfaction if true is returned, will contain details of the
- /// satisfaction, with enough information to diagnose an unsatisfied
- /// expression.
- /// \returns true if an error occurred and satisfaction could not be checked,
- /// false otherwise.
- bool CheckConstraintSatisfaction(
- const NamedDecl *Template,
- ArrayRef<AssociatedConstraint> AssociatedConstraints,
- llvm::SmallVectorImpl<Expr *> &ConvertedConstraints,
- const MultiLevelTemplateArgumentList &TemplateArgList,
- SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction);
+ SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction,
+ const ConceptReference *TopLevelConceptId = nullptr,
+ Expr **ConvertedExpr = nullptr);
/// \brief Check whether the given non-dependent constraint expression is
/// satisfied. Returns false and updates Satisfaction with the satisfaction
@@ -14831,16 +14843,17 @@ public:
/// \param First whether this is the first time an unsatisfied constraint is
/// diagnosed for this error.
void DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction &Satisfaction,
+ SourceLocation Loc = {},
bool First = true);
/// \brief Emit diagnostics explaining why a constraint expression was deemed
/// unsatisfied.
void
- DiagnoseUnsatisfiedConstraint(const ASTConstraintSatisfaction &Satisfaction,
+ DiagnoseUnsatisfiedConstraint(const ConceptSpecializationExpr *ConstraintExpr,
bool First = true);
const NormalizedConstraint *getNormalizedAssociatedConstraints(
- const NamedDecl *ConstrainedDecl,
+ ConstrainedDeclOrNestedRequirement Entity,
ArrayRef<AssociatedConstraint> AssociatedConstraints);
/// \brief Check whether the given declaration's associated constraints are
@@ -14865,6 +14878,15 @@ public:
const NamedDecl *D1, ArrayRef<AssociatedConstraint> AC1,
const NamedDecl *D2, ArrayRef<AssociatedConstraint> AC2);
+ /// Cache the satisfaction of an atomic constraint.
+ /// The key is based on the unsubstituted expression and the parameter
+ /// mapping. This lets us avoid substituting the mapping more than once,
+ /// which is (very!) expensive.
+ /// FIXME: this should be private.
+ llvm::DenseMap<llvm::FoldingSetNodeID,
+ UnsubstitutedConstraintSatisfactionCacheResult>
+ UnsubstitutedConstraintSatisfactionCache;
+
private:
/// Caches pairs of template-like decls whose associated constraints were
/// checked for subsumption and whether or not the first's constraints did in
@@ -14875,8 +14897,11 @@ private:
/// constrained declarations). If an error occurred while normalizing the
/// associated constraints of the template or concept, nullptr will be cached
/// here.
- llvm::DenseMap<const NamedDecl *, NormalizedConstraint *> NormalizationCache;
+ llvm::DenseMap<ConstrainedDeclOrNestedRequirement, NormalizedConstraint *>
+ NormalizationCache;
+ /// Cache whether the associated constraints of a declaration
+ /// are satisfied.
llvm::ContextualFoldingSet<ConstraintSatisfaction, const ASTContext &>
SatisfactionCache;
diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h
index 648a9c5..51ca1e1 100644
--- a/clang/include/clang/Sema/SemaConcept.h
+++ b/clang/include/clang/Sema/SemaConcept.h
@@ -16,130 +16,406 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
+#include "clang/AST/ExprConcepts.h"
#include "clang/Basic/SourceLocation.h"
+#include "clang/Sema/Ownership.h"
#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include <optional>
#include <utility>
namespace clang {
class Sema;
+class MultiLevelTemplateArgumentList;
-enum { ConstraintAlignment = 8 };
+/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is
+/// either an atomic constraint, a conjunction of normalized constraints or a
+/// disjunction of normalized constraints.
+struct NormalizedConstraint {
+
+ enum class ConstraintKind : unsigned char {
+ Atomic = 0,
+ ConceptId,
+ FoldExpanded,
+ Compound,
+ };
+
+ enum CompoundConstraintKind : unsigned char {
+ CCK_Conjunction,
+ CCK_Disjunction
+ };
+ enum class FoldOperatorKind : unsigned char { And, Or };
+
+ using OccurenceList = llvm::SmallBitVector;
+
+protected:
+ using ExprOrConcept =
+ llvm::PointerUnion<const Expr *, const ConceptReference *>;
+
+ struct AtomicConstraintBits {
+ // Kind is the first member of all union members,
+ // as we rely on their initial common sequence.
+ LLVM_PREFERRED_TYPE(ConstraintKind)
+ unsigned Kind : 5;
+ unsigned Placeholder : 1;
+ unsigned PackSubstitutionIndex : 26;
+ // Indexes, IndexesForSubsumption, and Args are part of the common initial
+ // sequences of constraints that do have a mapping.
+
+ // Indexes of the parameters used in a constraint expression.
+ OccurenceList Indexes;
+ // Indexes of the parameters named directly in a constraint expression.
+ // FIXME: we should try to reduce the size of this struct?
+ OccurenceList IndexesForSubsumption;
+
+ TemplateArgumentLoc *Args;
+ TemplateParameterList *ParamList;
+ ExprOrConcept ConstraintExpr;
+ const NamedDecl *ConstraintDecl;
+ };
+
+ struct FoldExpandedConstraintBits {
+ LLVM_PREFERRED_TYPE(ConstraintKind)
+ unsigned Kind : 5;
+ LLVM_PREFERRED_TYPE(FoldOperatorKind)
+ unsigned FoldOperator : 1;
+ unsigned Placeholder : 26;
+ OccurenceList Indexes;
+ OccurenceList IndexesForSubsumption;
+ TemplateArgumentLoc *Args;
+ TemplateParameterList *ParamList;
+ const Expr *Pattern;
+ const NamedDecl *ConstraintDecl;
+ NormalizedConstraint *Constraint;
+ };
+
+ struct ConceptIdBits : AtomicConstraintBits {
+ NormalizedConstraint *Sub;
+
+ // Only used for parameter mapping.
+ const ConceptSpecializationExpr *CSE;
+ };
+
+ struct CompoundConstraintBits {
+ LLVM_PREFERRED_TYPE(ConstraintKind)
+ unsigned Kind : 5;
+ LLVM_PREFERRED_TYPE(CompoundConstraintKind)
+ unsigned CCK : 1;
+ NormalizedConstraint *LHS;
+ NormalizedConstraint *RHS;
+ };
+
+ union {
+ AtomicConstraintBits Atomic;
+ FoldExpandedConstraintBits FoldExpanded;
+ ConceptIdBits ConceptId;
+ CompoundConstraintBits Compound;
+ };
+
+ ~NormalizedConstraint() {
+ if (getKind() != ConstraintKind::Compound)
+ Atomic.Indexes.llvm::SmallBitVector::~SmallBitVector();
+ }
+
+ NormalizedConstraint(const Expr *ConstraintExpr,
+ const NamedDecl *ConstraintDecl,
+ UnsignedOrNone PackIndex)
+ : Atomic{llvm::to_underlying(ConstraintKind::Atomic),
+ /*Placeholder=*/0,
+ PackIndex.toInternalRepresentation(),
+ /*Indexes=*/{},
+ /*IndexesForSubsumption=*/{},
+ /*Args=*/nullptr,
+ /*ParamList=*/nullptr,
+ ConstraintExpr,
+ ConstraintDecl} {}
+
+ NormalizedConstraint(const Expr *Pattern, FoldOperatorKind OpKind,
+ NormalizedConstraint *Constraint,
+ const NamedDecl *ConstraintDecl)
+ : FoldExpanded{llvm::to_underlying(ConstraintKind::FoldExpanded),
+ llvm::to_underlying(OpKind),
+ /*Placeholder=*/0,
+ /*Indexes=*/{},
+ /*IndexesForSubsumption=*/{},
+ /*Args=*/nullptr,
+ /*ParamList=*/nullptr,
+ Pattern,
+ ConstraintDecl,
+ Constraint} {}
+
+ NormalizedConstraint(const ConceptReference *ConceptId,
+ const NamedDecl *ConstraintDecl,
+ NormalizedConstraint *SubConstraint,
+ const ConceptSpecializationExpr *CSE,
+ UnsignedOrNone PackIndex)
+ : ConceptId{{llvm::to_underlying(ConstraintKind::ConceptId),
+ /*Placeholder=*/0, PackIndex.toInternalRepresentation(),
+ /*Indexes=*/{},
+ /*IndexesForSubsumption=*/{},
+ /*Args=*/nullptr, /*ParamList=*/nullptr, ConceptId,
+ ConstraintDecl},
+ SubConstraint,
+ CSE} {}
+
+ NormalizedConstraint(NormalizedConstraint *LHS, CompoundConstraintKind CCK,
+ NormalizedConstraint *RHS)
+ : Compound{llvm::to_underlying(ConstraintKind::Compound),
+ llvm::to_underlying(CCK), LHS, RHS} {}
+
+ bool hasParameterMapping() const {
+ // compound constraints do not have a mapping
+ // and Args is not part of their common initial sequence.
+ return getKind() != ConstraintKind::Compound && Atomic.Args != nullptr;
+ }
+
+ const OccurenceList &mappingOccurenceList() const {
+ assert(hasParameterMapping() && "This constraint has no parameter mapping");
+ return Atomic.Indexes;
+ }
+
+ const OccurenceList &mappingOccurenceListForSubsumption() const {
+ assert(hasParameterMapping() && "This constraint has no parameter mapping");
+ return Atomic.IndexesForSubsumption;
+ }
-struct alignas(ConstraintAlignment) AtomicConstraint {
- const Expr *ConstraintExpr;
- const NamedDecl *ConstraintDecl;
- std::optional<ArrayRef<TemplateArgumentLoc>> ParameterMapping;
+ llvm::MutableArrayRef<TemplateArgumentLoc> getParameterMapping() const {
+ return {Atomic.Args, Atomic.Indexes.count()};
+ }
+
+ TemplateParameterList *getUsedTemplateParamList() const {
+ return Atomic.ParamList;
+ }
- AtomicConstraint(const Expr *ConstraintExpr, const NamedDecl *ConstraintDecl)
- : ConstraintExpr(ConstraintExpr), ConstraintDecl(ConstraintDecl) {};
+ void updateParameterMapping(OccurenceList Indexes,
+ OccurenceList IndexesForSubsumption,
+ llvm::MutableArrayRef<TemplateArgumentLoc> Args,
+ TemplateParameterList *ParamList) {
+ assert(getKind() != ConstraintKind::Compound);
+ assert(Indexes.count() == Args.size());
+ assert(IndexesForSubsumption.size() == Indexes.size());
+ assert((Indexes | IndexesForSubsumption) == Indexes);
+
+ Atomic.IndexesForSubsumption = std::move(IndexesForSubsumption);
+ Atomic.Indexes = std::move(Indexes);
+ Atomic.Args = Args.data();
+ Atomic.ParamList = ParamList;
+ }
bool hasMatchingParameterMapping(ASTContext &C,
- const AtomicConstraint &Other) const {
- if (!ParameterMapping != !Other.ParameterMapping)
+ const NormalizedConstraint &Other) const {
+ assert(getKind() != ConstraintKind::Compound);
+
+ if (hasParameterMapping() != Other.hasParameterMapping())
return false;
- if (!ParameterMapping)
+ if (!hasParameterMapping())
return true;
- if (ParameterMapping->size() != Other.ParameterMapping->size())
- return false;
- for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) {
+ llvm::ArrayRef<TemplateArgumentLoc> ParameterMapping =
+ getParameterMapping();
+ llvm::ArrayRef<TemplateArgumentLoc> OtherParameterMapping =
+ Other.getParameterMapping();
+
+ const OccurenceList &Indexes = mappingOccurenceListForSubsumption();
+ const OccurenceList &OtherIndexes =
+ Other.mappingOccurenceListForSubsumption();
+
+ if (ParameterMapping.size() != OtherParameterMapping.size())
+ return false;
+ for (unsigned I = 0, S = ParameterMapping.size(); I < S; ++I) {
+ if (Indexes[I] != OtherIndexes[I])
+ return false;
+ if (!Indexes[I])
+ continue;
llvm::FoldingSetNodeID IDA, IDB;
- C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument())
+ C.getCanonicalTemplateArgument(ParameterMapping[I].getArgument())
.Profile(IDA, C);
- C.getCanonicalTemplateArgument((*Other.ParameterMapping)[I].getArgument())
+ C.getCanonicalTemplateArgument(OtherParameterMapping[I].getArgument())
.Profile(IDB, C);
if (IDA != IDB)
return false;
}
return true;
}
-};
-struct alignas(ConstraintAlignment) NormalizedConstraintPair;
-struct alignas(ConstraintAlignment) FoldExpandedConstraint;
+public:
+ ConstraintKind getKind() const {
+ return static_cast<ConstraintKind>(Atomic.Kind);
+ }
-/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is
-/// either an atomic constraint, a conjunction of normalized constraints or a
-/// disjunction of normalized constraints.
-struct NormalizedConstraint {
+ SourceLocation getBeginLoc() const {
+ switch (getKind()) {
+ case ConstraintKind::Atomic:
+ return cast<const Expr *>(Atomic.ConstraintExpr)->getBeginLoc();
+ case ConstraintKind::ConceptId:
+ return cast<const ConceptReference *>(Atomic.ConstraintExpr)
+ ->getBeginLoc();
+ case ConstraintKind::Compound:
+ return Compound.LHS->getBeginLoc();
+ case ConstraintKind::FoldExpanded:
+ return FoldExpanded.Pattern->getBeginLoc();
+ }
+ }
+
+ SourceLocation getEndLoc() const {
+ switch (getKind()) {
+ case ConstraintKind::Atomic:
+ return cast<const Expr *>(Atomic.ConstraintExpr)->getEndLoc();
+ case ConstraintKind::ConceptId:
+ return cast<const ConceptReference *>(Atomic.ConstraintExpr)->getEndLoc();
+ case ConstraintKind::Compound:
+ return Compound.RHS->getEndLoc();
+ case ConstraintKind::FoldExpanded:
+ return FoldExpanded.Pattern->getEndLoc();
+ }
+ }
+
+ SourceRange getSourceRange() const { return {getBeginLoc(), getEndLoc()}; }
+
+private:
friend class Sema;
+ static NormalizedConstraint *
+ fromAssociatedConstraints(Sema &S, const NamedDecl *D,
+ ArrayRef<AssociatedConstraint> ACs);
+ static NormalizedConstraint *fromConstraintExpr(Sema &S, const NamedDecl *D,
+ const Expr *E,
+ UnsignedOrNone SubstIndex);
+};
+
+class CompoundConstraint : public NormalizedConstraint {
+ using NormalizedConstraint::NormalizedConstraint;
- enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction };
+public:
+ static CompoundConstraint *Create(ASTContext &Ctx, NormalizedConstraint *LHS,
+ CompoundConstraintKind CCK,
+ NormalizedConstraint *RHS) {
+ return new (Ctx) CompoundConstraint(LHS, CCK, RHS);
+ }
- using CompoundConstraint = llvm::PointerIntPair<NormalizedConstraintPair *, 1,
- CompoundConstraintKind>;
+ static CompoundConstraint *CreateConjunction(ASTContext &Ctx,
+ NormalizedConstraint *LHS,
+ NormalizedConstraint *RHS) {
+ return new (Ctx) CompoundConstraint(LHS, CCK_Conjunction, RHS);
+ }
- llvm::PointerUnion<AtomicConstraint *, FoldExpandedConstraint *,
- CompoundConstraint>
- Constraint;
+ const NormalizedConstraint &getLHS() const { return *Compound.LHS; }
- NormalizedConstraint(AtomicConstraint *C): Constraint{C} { };
- NormalizedConstraint(FoldExpandedConstraint *C) : Constraint{C} {};
+ NormalizedConstraint &getLHS() { return *Compound.LHS; }
- NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS,
- NormalizedConstraint RHS, CompoundConstraintKind Kind);
+ const NormalizedConstraint &getRHS() const { return *Compound.RHS; }
- NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other);
- NormalizedConstraint(NormalizedConstraint &&Other):
- Constraint(Other.Constraint) {
- Other.Constraint = nullptr;
+ NormalizedConstraint &getRHS() { return *Compound.RHS; }
+
+ CompoundConstraintKind getCompoundKind() const {
+ return static_cast<CompoundConstraintKind>(Compound.CCK);
}
- NormalizedConstraint &operator=(const NormalizedConstraint &Other) = delete;
- NormalizedConstraint &operator=(NormalizedConstraint &&Other) {
- if (&Other != this) {
- NormalizedConstraint Temp(std::move(Other));
- std::swap(Constraint, Temp.Constraint);
- }
- return *this;
+};
+
+class NormalizedConstraintWithParamMapping : public NormalizedConstraint {
+protected:
+ using NormalizedConstraint::NormalizedConstraint;
+
+public:
+ using NormalizedConstraint::getParameterMapping;
+ using NormalizedConstraint::getUsedTemplateParamList;
+ using NormalizedConstraint::hasMatchingParameterMapping;
+ using NormalizedConstraint::hasParameterMapping;
+ using NormalizedConstraint::mappingOccurenceList;
+ using NormalizedConstraint::mappingOccurenceListForSubsumption;
+ using NormalizedConstraint::updateParameterMapping;
+
+ const NamedDecl *getConstraintDecl() const { return Atomic.ConstraintDecl; }
+
+ UnsignedOrNone getPackSubstitutionIndex() const {
+ return UnsignedOrNone::fromInternalRepresentation(
+ Atomic.PackSubstitutionIndex);
}
+};
+
+class AtomicConstraint : public NormalizedConstraintWithParamMapping {
+ using NormalizedConstraintWithParamMapping::
+ NormalizedConstraintWithParamMapping;
- bool isAtomic() const { return llvm::isa<AtomicConstraint *>(Constraint); }
- bool isFoldExpanded() const {
- return llvm::isa<FoldExpandedConstraint *>(Constraint);
+public:
+ static AtomicConstraint *Create(ASTContext &Ctx, const Expr *ConstraintExpr,
+ const NamedDecl *ConstraintDecl,
+ UnsignedOrNone PackIndex) {
+ return new (Ctx)
+ AtomicConstraint(ConstraintExpr, ConstraintDecl, PackIndex);
}
- bool isCompound() const { return llvm::isa<CompoundConstraint>(Constraint); }
- CompoundConstraintKind getCompoundKind() const;
+ const Expr *getConstraintExpr() const {
+ return cast<const Expr *>(Atomic.ConstraintExpr);
+ }
+};
- NormalizedConstraint &getLHS() const;
- NormalizedConstraint &getRHS() const;
+class FoldExpandedConstraint : public NormalizedConstraintWithParamMapping {
+ using NormalizedConstraintWithParamMapping::
+ NormalizedConstraintWithParamMapping;
- AtomicConstraint *getAtomicConstraint() const;
+public:
+ static FoldExpandedConstraint *Create(ASTContext &Ctx, const Expr *Pattern,
+ const NamedDecl *ConstraintDecl,
+ FoldOperatorKind OpKind,
+ NormalizedConstraint *Constraint) {
+ return new (Ctx)
+ FoldExpandedConstraint(Pattern, OpKind, Constraint, ConstraintDecl);
+ }
- FoldExpandedConstraint *getFoldExpandedConstraint() const;
+ using NormalizedConstraint::hasMatchingParameterMapping;
-private:
- static std::optional<NormalizedConstraint>
- fromAssociatedConstraints(Sema &S, const NamedDecl *D,
- ArrayRef<AssociatedConstraint> ACs);
- static std::optional<NormalizedConstraint>
- fromConstraintExpr(Sema &S, const NamedDecl *D, const Expr *E);
-};
+ FoldOperatorKind getFoldOperator() const {
+ return static_cast<FoldOperatorKind>(FoldExpanded.FoldOperator);
+ }
-struct alignas(ConstraintAlignment) NormalizedConstraintPair {
- NormalizedConstraint LHS, RHS;
-};
+ const Expr *getPattern() const { return FoldExpanded.Pattern; }
-struct alignas(ConstraintAlignment) FoldExpandedConstraint {
- enum class FoldOperatorKind { And, Or } Kind;
- NormalizedConstraint Constraint;
- const Expr *Pattern;
+ const NormalizedConstraint &getNormalizedPattern() const {
+ return *FoldExpanded.Constraint;
+ }
- FoldExpandedConstraint(FoldOperatorKind K, NormalizedConstraint C,
- const Expr *Pattern)
- : Kind(K), Constraint(std::move(C)), Pattern(Pattern) {};
+ NormalizedConstraint &getNormalizedPattern() {
+ return *FoldExpanded.Constraint;
+ }
static bool AreCompatibleForSubsumption(const FoldExpandedConstraint &A,
const FoldExpandedConstraint &B);
};
-const NormalizedConstraint *getNormalizedAssociatedConstraints(
- Sema &S, const NamedDecl *ConstrainedDecl,
- ArrayRef<AssociatedConstraint> AssociatedConstraints);
+class ConceptIdConstraint : public NormalizedConstraintWithParamMapping {
+ using NormalizedConstraintWithParamMapping::
+ NormalizedConstraintWithParamMapping;
+
+public:
+ static ConceptIdConstraint *
+ Create(ASTContext &Ctx, const ConceptReference *ConceptId,
+ NormalizedConstraint *SubConstraint, const NamedDecl *ConstraintDecl,
+ const ConceptSpecializationExpr *CSE, UnsignedOrNone PackIndex) {
+ return new (Ctx) ConceptIdConstraint(ConceptId, ConstraintDecl,
+ SubConstraint, CSE, PackIndex);
+ }
+
+ const ConceptSpecializationExpr *getConceptSpecializationExpr() const {
+ return ConceptId.CSE;
+ }
+
+ const ConceptReference *getConceptId() const {
+ return cast<const ConceptReference *>(ConceptId.ConstraintExpr);
+ }
+
+ const NormalizedConstraint &getNormalizedConstraint() const {
+ return *ConceptId.Sub;
+ }
+
+ NormalizedConstraint &getNormalizedConstraint() { return *ConceptId.Sub; }
+};
+
+struct UnsubstitutedConstraintSatisfactionCacheResult {
+ ExprResult SubstExpr;
+ ConstraintSatisfaction Satisfaction;
+};
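For illustration, a sketch of how the factory functions above compose (hypothetical variables, not part of the patch); the normal form of "C1<T> && (C2<Ts> && ...)" would be built roughly as:

    // All nodes are arena-allocated on the ASTContext.
    AtomicConstraint *Atom =
        AtomicConstraint::Create(Ctx, C1Expr, D, /*PackIndex=*/std::nullopt);
    FoldExpandedConstraint *Fold = FoldExpandedConstraint::Create(
        Ctx, C2Pattern, D, FoldExpandedConstraint::FoldOperatorKind::And,
        NormalizedC2);
    CompoundConstraint *Root =
        CompoundConstraint::CreateConjunction(Ctx, Atom, Fold);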
/// \brief SubsumptionChecker establishes subsumption
/// between two sets of constraints.
@@ -189,13 +465,13 @@ private:
};
struct MappedAtomicConstraint {
- AtomicConstraint *Constraint;
+ const AtomicConstraint *Constraint;
Literal ID;
};
struct FoldExpendedConstraintKey {
FoldExpandedConstraint::FoldOperatorKind Kind;
- AtomicConstraint *Constraint;
+ const AtomicConstraint *Constraint;
Literal ID;
};
@@ -207,7 +483,7 @@ private:
// A map from a literal to a corresponding associated constraint.
// We do not have enough bits left for a pointer union here :(
- llvm::DenseMap<uint16_t, void *> ReverseMap;
+ llvm::DenseMap<uint16_t, const void *> ReverseMap;
// Fold expanded constraints ask us to recursively establish subsumption.
// This caches the result.
@@ -234,12 +510,12 @@ private:
FormulaType Normalize(const NormalizedConstraint &C);
void AddUniqueClauseToFormula(Formula &F, Clause C);
- Literal find(AtomicConstraint *);
- Literal find(FoldExpandedConstraint *);
+ Literal find(const AtomicConstraint *);
+ Literal find(const FoldExpandedConstraint *);
uint16_t getNewLiteralId();
};
-} // clang
+} // namespace clang
#endif // LLVM_CLANG_SEMA_SEMACONCEPT_H
diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h
index 115c19d..60c7d27 100644
--- a/clang/include/clang/Sema/Template.h
+++ b/clang/include/clang/Sema/Template.h
@@ -234,21 +234,25 @@ enum class TemplateSubstitutionKind : char {
/// Replaces the current 'innermost' level with the provided argument list.
/// This is useful for type deduction cases where we need to get the entire
/// list from the AST, but then add the deduced innermost list.
- void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) {
+ void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args,
+ bool Final = false) {
assert((!TemplateArgumentLists.empty() || NumRetainedOuterLevels) &&
"Replacing in an empty list?");
if (!TemplateArgumentLists.empty()) {
- assert((TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() ||
- TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() ==
- AssociatedDecl) &&
- "Trying to change incorrect declaration?");
TemplateArgumentLists[0].Args = Args;
- } else {
- --NumRetainedOuterLevels;
- TemplateArgumentLists.push_back(
- {{AssociatedDecl, /*Final=*/false}, Args});
+ return;
}
+ --NumRetainedOuterLevels;
+ TemplateArgumentLists.push_back(
+ {{AssociatedDecl, /*Final=*/Final}, Args});
+ }
+
+ void replaceOutermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) {
+ assert((!TemplateArgumentLists.empty()) && "Replacing in an empty list?");
+ TemplateArgumentLists.back().AssociatedDeclAndFinal.setPointer(
+ AssociatedDecl);
+ TemplateArgumentLists.back().Args = Args;
}
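An illustrative use of the two replace entry points (hypothetical names, assuming an MLTAL already built for FD):

    // Deduction: swap in the deduced innermost arguments.
    MLTAL.replaceInnermostTemplateArguments(FD, DeducedArgs, /*Final=*/true);
    // Constraint checking: retarget the outermost level after a parameter
    // mapping has been substituted (see SubstitutionInTemplateArguments in
    // SemaConcept.cpp below).
    MLTAL.replaceOutermostTemplateArguments(ConstraintDecl, SubstitutedOuterMost);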
/// Add an outermost level that we are not substituting. We have no
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index d658890..fd12bc4 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -24,13 +24,18 @@ static void
CreateUnsatisfiedConstraintRecord(const ASTContext &C,
const UnsatisfiedConstraintRecord &Detail,
UnsatisfiedConstraintRecord *TrailingObject) {
- if (auto *E = dyn_cast<Expr *>(Detail))
+ if (Detail.isNull())
+ new (TrailingObject) UnsatisfiedConstraintRecord(nullptr);
+ else if (const auto *E = llvm::dyn_cast<const Expr *>(Detail))
new (TrailingObject) UnsatisfiedConstraintRecord(E);
+ else if (const auto *Concept =
+ llvm::dyn_cast<const ConceptReference *>(Detail))
+ new (TrailingObject) UnsatisfiedConstraintRecord(Concept);
else {
auto &SubstitutionDiagnostic =
- *cast<std::pair<SourceLocation, StringRef> *>(Detail);
+ *cast<const clang::ConstraintSubstitutionDiagnostic *>(Detail);
StringRef Message = C.backupStr(SubstitutionDiagnostic.second);
- auto *NewSubstDiag = new (C) std::pair<SourceLocation, StringRef>(
+ auto *NewSubstDiag = new (C) clang::ConstraintSubstitutionDiagnostic(
SubstitutionDiagnostic.first, Message);
new (TrailingObject) UnsatisfiedConstraintRecord(NewSubstDiag);
}
@@ -74,9 +79,10 @@ ASTConstraintSatisfaction *ASTConstraintSatisfaction::Rebuild(
return new (Mem) ASTConstraintSatisfaction(C, Satisfaction);
}
-void ConstraintSatisfaction::Profile(
- llvm::FoldingSetNodeID &ID, const ASTContext &C,
- const NamedDecl *ConstraintOwner, ArrayRef<TemplateArgument> TemplateArgs) {
+void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID,
+ const ASTContext &C,
+ const NamedDecl *ConstraintOwner,
+ ArrayRef<TemplateArgument> TemplateArgs) {
ID.AddPointer(ConstraintOwner);
ID.AddInteger(TemplateArgs.size());
for (auto &Arg : TemplateArgs)
@@ -116,6 +122,19 @@ void ConceptReference::print(llvm::raw_ostream &OS,
}
}
+const StreamingDiagnostic &clang::operator<<(const StreamingDiagnostic &DB,
+ const ConceptReference *C) {
+ std::string NameStr;
+ llvm::raw_string_ostream OS(NameStr);
+ LangOptions LO;
+ LO.CPlusPlus = true;
+ LO.Bool = true;
+ OS << '\'';
+ C->print(OS, PrintingPolicy(LO));
+ OS << '\'';
+ return DB << NameStr;
+}
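With this operator a ConceptReference can be streamed directly into a diagnostic; a minimal sketch (the diagnostic ID is made up):

    // Prints the reference single-quoted, e.g. 'std::integral<T>'.
    S.Diag(Loc, diag::note_example_concept_here) << ConceptRef;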
+
concepts::ExprRequirement::ExprRequirement(
Expr *E, bool IsSimple, SourceLocation NoexceptLoc,
ReturnTypeRequirement Req, SatisfactionStatus Status,
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 1c8fd83..f43fa8c 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1069,22 +1069,22 @@ Error ASTNodeImporter::ImportConstraintSatisfaction(
ToSat.ContainsErrors = FromSat.ContainsErrors;
if (!ToSat.IsSatisfied) {
for (auto Record = FromSat.begin(); Record != FromSat.end(); ++Record) {
- if (Expr *E = Record->dyn_cast<Expr *>()) {
+ if (const Expr *E = Record->dyn_cast<const Expr *>()) {
ExpectedExpr ToSecondExpr = import(E);
if (!ToSecondExpr)
return ToSecondExpr.takeError();
ToSat.Details.emplace_back(ToSecondExpr.get());
} else {
- auto Pair = Record->dyn_cast<std::pair<SourceLocation, StringRef> *>();
+ auto Pair =
+ Record->dyn_cast<const ConstraintSubstitutionDiagnostic *>();
ExpectedSLoc ToPairFirst = import(Pair->first);
if (!ToPairFirst)
return ToPairFirst.takeError();
StringRef ToPairSecond = ImportASTStringRef(Pair->second);
- ToSat.Details.emplace_back(
- new (Importer.getToContext())
- ConstraintSatisfaction::SubstitutionDiagnostic{
- ToPairFirst.get(), ToPairSecond});
+ ToSat.Details.emplace_back(new (Importer.getToContext())
+ ConstraintSubstitutionDiagnostic{
+ ToPairFirst.get(), ToPairSecond});
}
}
}
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index dc6d232..8413090 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -12,9 +12,11 @@
#include "clang/Sema/SemaConcept.h"
#include "TreeTransform.h"
+#include "clang/AST/ASTConcept.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/ExprConcepts.h"
+#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/Basic/OperatorPrecedence.h"
#include "clang/Sema/EnterExpressionEvaluationContext.h"
#include "clang/Sema/Initialization.h"
@@ -27,7 +29,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/StringExtras.h"
-#include <optional>
+#include "llvm/Support/SaveAndRestore.h"
using namespace clang;
using namespace sema;
@@ -85,7 +87,7 @@ public:
OK_Ordinary, Loc, FPOptionsOverride{});
}
};
-}
+} // namespace
bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression,
Token NextToken, bool *PossibleNonPrimary,
@@ -146,14 +148,14 @@ bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression,
if (!Context.hasSameUnqualifiedType(Type, Context.BoolTy)) {
Diag(ConstraintExpression->getExprLoc(),
- diag::err_non_bool_atomic_constraint) << Type
- << ConstraintExpression->getSourceRange();
+ diag::err_non_bool_atomic_constraint)
+ << Type << ConstraintExpression->getSourceRange();
CheckForNonPrimary();
return false;
}
if (PossibleNonPrimary)
- *PossibleNonPrimary = false;
+ *PossibleNonPrimary = false;
return true;
}
@@ -164,52 +166,315 @@ struct SatisfactionStackRAII {
SatisfactionStackRAII(Sema &SemaRef, const NamedDecl *ND,
const llvm::FoldingSetNodeID &FSNID)
: SemaRef(SemaRef) {
- if (ND) {
+ if (ND) {
SemaRef.PushSatisfactionStackEntry(ND, FSNID);
Inserted = true;
- }
+ }
}
~SatisfactionStackRAII() {
- if (Inserted)
- SemaRef.PopSatisfactionStackEntry();
+ if (Inserted)
+ SemaRef.PopSatisfactionStackEntry();
}
};
} // namespace
-static bool
-DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID,
- const NamedDecl *Templ, const Expr *E,
- const MultiLevelTemplateArgumentList &MLTAL) {
+static bool DiagRecursiveConstraintEval(
+ Sema &S, llvm::FoldingSetNodeID &ID, const NamedDecl *Templ, const Expr *E,
+ const MultiLevelTemplateArgumentList *MLTAL = nullptr) {
E->Profile(ID, S.Context, /*Canonical=*/true);
- for (const auto &List : MLTAL)
- for (const auto &TemplateArg : List.Args)
- TemplateArg.Profile(ID, S.Context);
-
- // Note that we have to do this with our own collection, because there are
- // times where a constraint-expression check can cause us to need to evaluate
- // other constriants that are unrelated, such as when evaluating a recovery
- // expression, or when trying to determine the constexpr-ness of special
- // members. Otherwise we could just use the
- // Sema::InstantiatingTemplate::isAlreadyBeingInstantiated function.
+ if (MLTAL) {
+ for (const auto &List : *MLTAL)
+ for (const auto &TemplateArg : List.Args)
+ S.Context.getCanonicalTemplateArgument(TemplateArg)
+ .Profile(ID, S.Context);
+ }
if (S.SatisfactionStackContains(Templ, ID)) {
S.Diag(E->getExprLoc(), diag::err_constraint_depends_on_self)
<< E << E->getSourceRange();
return true;
}
-
return false;
}
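A minimal reproduction of the self-dependence this guards against (adapted illustration, not from the patch):

    template <class T>
    concept HasF = requires(T t) { f(t); };

    // The constrained f is itself a candidate for the call f(t) above, so
    // checking HasF<int> re-enters HasF<int> with the same profile; the
    // satisfaction stack detects the cycle and emits
    // err_constraint_depends_on_self instead of recursing forever.
    auto f(HasF auto t) { return 0; }
    int x = f(0);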
-static ExprResult EvaluateAtomicConstraint(
- Sema &S, const Expr *AtomicExpr, const NamedDecl *Template,
- SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction) {
+// Figure out the to-translation-unit depth for this function declaration for
+// the purpose of seeing if they differ by constraints. This isn't the same as
+// getTemplateDepth, because it includes already instantiated parents.
+static unsigned
+CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND,
+ bool SkipForSpecialization = false) {
+ MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
+ ND, ND->getLexicalDeclContext(), /*Final=*/false,
+ /*Innermost=*/std::nullopt,
+ /*RelativeToPrimary=*/true,
+ /*Pattern=*/nullptr,
+ /*ForConstraintInstantiation=*/true, SkipForSpecialization);
+ return MLTAL.getNumLevels();
+}
+
+namespace {
+class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> {
+ unsigned TemplateDepth = 0;
+
+public:
+ using inherited = TreeTransform<AdjustConstraintDepth>;
+ AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth)
+ : inherited(SemaRef), TemplateDepth(TemplateDepth) {}
+
+ using inherited::TransformTemplateTypeParmType;
+ QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB,
+ TemplateTypeParmTypeLoc TL, bool) {
+ const TemplateTypeParmType *T = TL.getTypePtr();
+
+ TemplateTypeParmDecl *NewTTPDecl = nullptr;
+ if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl())
+ NewTTPDecl = cast_or_null<TemplateTypeParmDecl>(
+ TransformDecl(TL.getNameLoc(), OldTTPDecl));
+
+ QualType Result = getSema().Context.getTemplateTypeParmType(
+ T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(),
+ NewTTPDecl);
+ TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result);
+ NewTL.setNameLoc(TL.getNameLoc());
+ return Result;
+ }
+
+ bool AlreadyTransformed(QualType T) {
+ if (T.isNull())
+ return true;
+
+ if (T->isInstantiationDependentType() || T->isVariablyModifiedType() ||
+ T->containsUnexpandedParameterPack())
+ return false;
+ return true;
+ }
+};
+} // namespace
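What the transform does, in one line: every TemplateTypeParmType at depth k is rebuilt at depth k + TemplateDepth (same index), so two constraint expressions written under different levels of enclosing instantiation can be profiled for equivalence. A hypothetical use:

    ExprResult Adjusted = AdjustConstraintDepth(S, /*TemplateDepth=*/1)
                              .TransformExpr(const_cast<Expr *>(ConstrExpr));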
+
+namespace {
+
+// FIXME: Convert it to DynamicRecursiveASTVisitor
+class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> {
+ using inherited = RecursiveASTVisitor<HashParameterMapping>;
+ friend inherited;
+
+ Sema &SemaRef;
+ const MultiLevelTemplateArgumentList &TemplateArgs;
+ llvm::FoldingSetNodeID &ID;
+ llvm::SmallVector<TemplateArgument, 10> UsedTemplateArgs;
+
+ UnsignedOrNone OuterPackSubstIndex;
+
+ TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) {
+ assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size());
+ Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex];
+ if (Arg.isPackExpansion())
+ Arg = Arg.getPackExpansionPattern();
+ return Arg;
+ }
+
+ bool shouldVisitTemplateInstantiations() const { return true; }
+
+public:
+ HashParameterMapping(Sema &SemaRef,
+ const MultiLevelTemplateArgumentList &TemplateArgs,
+ llvm::FoldingSetNodeID &ID,
+ UnsignedOrNone OuterPackSubstIndex)
+ : SemaRef(SemaRef), TemplateArgs(TemplateArgs), ID(ID),
+ OuterPackSubstIndex(OuterPackSubstIndex) {}
+
+ bool VisitTemplateTypeParmType(TemplateTypeParmType *T) {
+ // A lambda expression can introduce template parameters that don't have
+ // corresponding template arguments yet.
+ if (T->getDepth() >= TemplateArgs.getNumLevels())
+ return true;
+
+ TemplateArgument Arg = TemplateArgs(T->getDepth(), T->getIndex());
+
+ if (T->isParameterPack() && SemaRef.ArgPackSubstIndex) {
+ assert(Arg.getKind() == TemplateArgument::Pack &&
+ "Missing argument pack");
+
+ Arg = getPackSubstitutedTemplateArgument(Arg);
+ }
+
+ UsedTemplateArgs.push_back(
+ SemaRef.Context.getCanonicalTemplateArgument(Arg));
+ return true;
+ }
+
+ bool VisitDeclRefExpr(DeclRefExpr *E) {
+ NamedDecl *D = E->getDecl();
+ NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(D);
+ if (!NTTP)
+ return TraverseDecl(D);
+
+ TemplateArgument Arg = TemplateArgs(NTTP->getDepth(), NTTP->getPosition());
+ if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) {
+ assert(Arg.getKind() == TemplateArgument::Pack &&
+ "Missing argument pack");
+ Arg = getPackSubstitutedTemplateArgument(Arg);
+ }
+
+ UsedTemplateArgs.push_back(
+ SemaRef.Context.getCanonicalTemplateArgument(Arg));
+ return true;
+ }
+
+ bool VisitTypedefType(TypedefType *TT) {
+ return inherited::TraverseType(TT->desugar());
+ }
+
+ bool TraverseDecl(Decl *D) {
+ if (auto *VD = dyn_cast<ValueDecl>(D))
+ return TraverseType(VD->getType());
+
+ return inherited::TraverseDecl(D);
+ }
+
+ bool TraverseTypeLoc(TypeLoc TL, bool TraverseQualifier = true) {
+ // We don't care about TypeLocs. So traverse Types instead.
+ return TraverseType(TL.getType(), TraverseQualifier);
+ }
+
+ bool TraverseTagType(const TagType *T, bool TraverseQualifier) {
+ // T's parent can be dependent while T doesn't have any template arguments.
+ // We should have already traversed its qualifier.
+    // FIXME: Add an assert to catch cases where we failed to profile the
+    // concept:
+    //   assert(!T->isDependentType() && "We missed a case in profiling concepts!");
+ return true;
+ }
+
+ bool TraverseInjectedClassNameType(InjectedClassNameType *T,
+ bool TraverseQualifier) {
+ return TraverseTemplateArguments(T->getTemplateArgs(SemaRef.Context));
+ }
+
+ bool TraverseTemplateArgument(const TemplateArgument &Arg) {
+ if (!Arg.containsUnexpandedParameterPack() || Arg.isPackExpansion()) {
+ // Act as if we are fully expanding this pack, if it is a PackExpansion.
+ Sema::ArgPackSubstIndexRAII _1(SemaRef, std::nullopt);
+ llvm::SaveAndRestore<UnsignedOrNone> _2(OuterPackSubstIndex,
+ std::nullopt);
+ return inherited::TraverseTemplateArgument(Arg);
+ }
+
+ Sema::ArgPackSubstIndexRAII _1(SemaRef, OuterPackSubstIndex);
+ return inherited::TraverseTemplateArgument(Arg);
+ }
+
+ void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) {
+ if (!Constraint.hasParameterMapping()) {
+ for (const auto &List : TemplateArgs)
+ for (const TemplateArgument &Arg : List.Args)
+ SemaRef.Context.getCanonicalTemplateArgument(Arg).Profile(
+ ID, SemaRef.Context);
+ return;
+ }
+
+ llvm::ArrayRef<TemplateArgumentLoc> Mapping =
+ Constraint.getParameterMapping();
+ for (auto &ArgLoc : Mapping) {
+ TemplateArgument Canonical =
+ SemaRef.Context.getCanonicalTemplateArgument(ArgLoc.getArgument());
+      // Canonicalize so that type sugar does not affect the cache key profile.
+ UsedTemplateArgs.push_back(Canonical);
+ TraverseTemplateArgument(Canonical);
+ }
+
+ for (auto &Used : UsedTemplateArgs) {
+ llvm::FoldingSetNodeID R;
+ Used.Profile(R, SemaRef.Context);
+ ID.AddNodeID(R);
+ }
+ }
+};
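How the visitor is driven (this mirrors the Evaluate overloads later in this file; Atomic and PackIndex are placeholders):

    llvm::FoldingSetNodeID ID;
    ID.AddPointer(Atomic.getConstraintExpr());
    ID.AddInteger(PackIndex.toInternalRepresentation());
    HashParameterMapping(S, MLTAL, ID, PackIndex).VisitConstraint(Atomic);
    // ID now identifies "this constraint under these canonical mapped
    // arguments" and keys UnsubstitutedConstraintSatisfactionCache.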
+
+class ConstraintSatisfactionChecker {
+ Sema &S;
+ const NamedDecl *Template;
+ SourceLocation TemplateNameLoc;
+ UnsignedOrNone PackSubstitutionIndex;
+
+ ConstraintSatisfaction &Satisfaction;
+
+private:
+ ExprResult
+ EvaluateAtomicConstraint(const Expr *AtomicExpr,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ UnsignedOrNone EvaluateFoldExpandedConstraintSize(
+ const FoldExpandedConstraint &FE,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ // XXX: It is SLOW! Use it very carefully.
+ std::optional<MultiLevelTemplateArgumentList> SubstitutionInTemplateArguments(
+ const NormalizedConstraintWithParamMapping &Constraint,
+ MultiLevelTemplateArgumentList MLTAL,
+ llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost);
+
+ ExprResult EvaluateSlow(const AtomicConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ ExprResult Evaluate(const AtomicConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ ExprResult EvaluateSlow(const FoldExpandedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ ExprResult Evaluate(const FoldExpandedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ ExprResult EvaluateSlow(const ConceptIdConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL,
+ unsigned int Size);
+
+ ExprResult Evaluate(const ConceptIdConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+ ExprResult Evaluate(const CompoundConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+
+public:
+ ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template,
+ SourceLocation TemplateNameLoc,
+ UnsignedOrNone PackSubstitutionIndex,
+ ConstraintSatisfaction &Satisfaction)
+ : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc),
+ PackSubstitutionIndex(PackSubstitutionIndex),
+ Satisfaction(Satisfaction) {}
+
+ ExprResult Evaluate(const NormalizedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL);
+};
+
+StringRef allocateStringFromConceptDiagnostic(const Sema &S,
+ const PartialDiagnostic Diag) {
+ SmallString<128> DiagString;
+ DiagString = ": ";
+ Diag.EmitToString(S.getDiagnostics(), DiagString);
+ return S.getASTContext().backupStr(DiagString);
+}
+
+} // namespace
+
+ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint(
+ const Expr *AtomicExpr, const MultiLevelTemplateArgumentList &MLTAL) {
EnterExpressionEvaluationContext ConstantEvaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated,
Sema::ReuseLambdaContextDecl);
+ llvm::FoldingSetNodeID ID;
+ if (Template &&
+ DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, &MLTAL)) {
+ Satisfaction.IsSatisfied = false;
+ Satisfaction.ContainsErrors = true;
+ return ExprEmpty();
+ }
+ SatisfactionStackRAII StackRAII(S, Template, ID);
+
// Atomic constraint - substitute arguments and check satisfaction.
- ExprResult SubstitutedExpression;
+ ExprResult SubstitutedExpression = const_cast<Expr *>(AtomicExpr);
{
TemplateDeductionInfo Info(TemplateNameLoc);
Sema::InstantiatingTemplate Inst(
@@ -220,16 +485,6 @@ static ExprResult EvaluateAtomicConstraint(
if (Inst.isInvalid())
return ExprError();
- llvm::FoldingSetNodeID ID;
- if (Template &&
- DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, MLTAL)) {
- Satisfaction.IsSatisfied = false;
- Satisfaction.ContainsErrors = true;
- return ExprEmpty();
- }
-
- SatisfactionStackRAII StackRAII(S, Template, ID);
-
// We do not want error diagnostics escaping here.
Sema::SFINAETrap Trap(S);
SubstitutedExpression =
@@ -247,21 +502,16 @@ static ExprResult EvaluateAtomicConstraint(
PartialDiagnosticAt SubstDiag{SourceLocation(),
PartialDiagnostic::NullDiagnostic()};
Info.takeSFINAEDiagnostic(SubstDiag);
- // FIXME: Concepts: This is an unfortunate consequence of there
+ // FIXME: This is an unfortunate consequence of there
// being no serialization code for PartialDiagnostics and the fact
// that serializing them would likely take a lot more storage than
// just storing them as strings. We would still like, in the
// future, to serialize the proper PartialDiagnostic as serializing
// it as a string defeats the purpose of the diagnostic mechanism.
- SmallString<128> DiagString;
- DiagString = ": ";
- SubstDiag.second.EmitToString(S.getDiagnostics(), DiagString);
- unsigned MessageSize = DiagString.size();
- char *Mem = new (S.Context) char[MessageSize];
- memcpy(Mem, DiagString.c_str(), MessageSize);
Satisfaction.Details.emplace_back(
- new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{
- SubstDiag.first, StringRef(Mem, MessageSize)});
+ new (S.Context) ConstraintSubstitutionDiagnostic{
+ SubstDiag.first,
+ allocateStringFromConceptDiagnostic(S, SubstDiag.second)});
Satisfaction.IsSatisfied = false;
return ExprEmpty();
}
@@ -289,216 +539,94 @@ static ExprResult EvaluateAtomicConstraint(
return SubstitutedExpression;
}
-static UnsignedOrNone EvaluateFoldExpandedConstraintSize(
- Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template,
- SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction) {
-
- // We should ignore errors in the presence of packs of different size.
- Sema::SFINAETrap Trap(S);
-
- Expr *Pattern = FE->getPattern();
+std::optional<MultiLevelTemplateArgumentList>
+ConstraintSatisfactionChecker::SubstitutionInTemplateArguments(
+ const NormalizedConstraintWithParamMapping &Constraint,
+ MultiLevelTemplateArgumentList MLTAL,
+ llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost) {
- SmallVector<UnexpandedParameterPack, 2> Unexpanded;
- S.collectUnexpandedParameterPacks(Pattern, Unexpanded);
- assert(!Unexpanded.empty() && "Pack expansion without parameter packs?");
- bool Expand = true;
- bool RetainExpansion = false;
- UnsignedOrNone NumExpansions = FE->getNumExpansions();
- if (S.CheckParameterPacksForExpansion(
- FE->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
- /*FailOnPackProducingTemplates=*/true, Expand, RetainExpansion,
- NumExpansions) ||
- !Expand || RetainExpansion)
- return std::nullopt;
+ if (!Constraint.hasParameterMapping())
+ return std::move(MLTAL);
- if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
- S.Diag(FE->getEllipsisLoc(),
- clang::diag::err_fold_expression_limit_exceeded)
- << *NumExpansions << S.getLangOpts().BracketDepth
- << FE->getSourceRange();
- S.Diag(FE->getEllipsisLoc(), diag::note_bracket_depth);
+ TemplateDeductionInfo Info(Constraint.getBeginLoc());
+ Sema::InstantiatingTemplate Inst(
+ S, Constraint.getBeginLoc(),
+ Sema::InstantiatingTemplate::ConstraintSubstitution{},
+ // FIXME: improve const-correctness of InstantiatingTemplate
+ const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+ if (Inst.isInvalid())
return std::nullopt;
- }
- return NumExpansions;
-}
-
-static ExprResult calculateConstraintSatisfaction(
- Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template,
- SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction);
-
-static ExprResult calculateConstraintSatisfaction(
- Sema &S, const Expr *LHS, OverloadedOperatorKind Op, const Expr *RHS,
- const NamedDecl *Template, SourceLocation TemplateNameLoc,
- const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction) {
- size_t EffectiveDetailEndIndex = Satisfaction.Details.size();
-
- ExprResult LHSRes = calculateConstraintSatisfaction(
- S, LHS, Template, TemplateNameLoc, MLTAL, Satisfaction);
-
- if (LHSRes.isInvalid())
- return ExprError();
-
- bool IsLHSSatisfied = Satisfaction.IsSatisfied;
-
- if (Op == clang::OO_PipePipe && IsLHSSatisfied)
- // [temp.constr.op] p3
- // A disjunction is a constraint taking two operands. To determine if
- // a disjunction is satisfied, the satisfaction of the first operand
- // is checked. If that is satisfied, the disjunction is satisfied.
- // Otherwise, the disjunction is satisfied if and only if the second
- // operand is satisfied.
- // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp.
- return LHSRes;
-
- if (Op == clang::OO_AmpAmp && !IsLHSSatisfied)
- // [temp.constr.op] p2
- // A conjunction is a constraint taking two operands. To determine if
- // a conjunction is satisfied, the satisfaction of the first operand
- // is checked. If that is not satisfied, the conjunction is not
- // satisfied. Otherwise, the conjunction is satisfied if and only if
- // the second operand is satisfied.
- // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp.
- return LHSRes;
-
- ExprResult RHSRes = calculateConstraintSatisfaction(
- S, RHS, Template, TemplateNameLoc, MLTAL, Satisfaction);
- if (RHSRes.isInvalid())
- return ExprError();
- bool IsRHSSatisfied = Satisfaction.IsSatisfied;
- // Current implementation adds diagnostic information about the falsity
- // of each false atomic constraint expression when it evaluates them.
- // When the evaluation results to `false || true`, the information
- // generated during the evaluation of left-hand side is meaningless
- // because the whole expression evaluates to true.
- // The following code removes the irrelevant diagnostic information.
- // FIXME: We should probably delay the addition of diagnostic information
- // until we know the entire expression is false.
- if (Op == clang::OO_PipePipe && IsRHSSatisfied) {
- auto EffectiveDetailEnd = Satisfaction.Details.begin();
- std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex);
- Satisfaction.Details.erase(EffectiveDetailEnd, Satisfaction.Details.end());
- }
-
- if (!LHSRes.isUsable() || !RHSRes.isUsable())
- return ExprEmpty();
-
- return BinaryOperator::Create(S.Context, LHSRes.get(), RHSRes.get(),
- BinaryOperator::getOverloadedOpcode(Op),
- S.Context.BoolTy, VK_PRValue, OK_Ordinary,
- LHS->getBeginLoc(), FPOptionsOverride{});
-}
-
-static ExprResult calculateConstraintSatisfaction(
- Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template,
- SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction) {
- bool Conjunction = FE->getOperator() == BinaryOperatorKind::BO_LAnd;
- size_t EffectiveDetailEndIndex = Satisfaction.Details.size();
-
- ExprResult Out;
- if (FE->isLeftFold() && FE->getInit()) {
- Out = calculateConstraintSatisfaction(S, FE->getInit(), Template,
- TemplateNameLoc, MLTAL, Satisfaction);
- if (Out.isInvalid())
- return ExprError();
+ Sema::SFINAETrap Trap(S);
- // If the first clause of a conjunction is not satisfied,
- // or if the first clause of a disjection is satisfied,
- // we have established satisfaction of the whole constraint
- // and we should not continue further.
- if (Conjunction != Satisfaction.IsSatisfied)
- return Out;
- }
- UnsignedOrNone NumExpansions = EvaluateFoldExpandedConstraintSize(
- S, FE, Template, TemplateNameLoc, MLTAL, Satisfaction);
- if (!NumExpansions)
- return ExprError();
- for (unsigned I = 0; I < *NumExpansions; I++) {
- Sema::ArgPackSubstIndexRAII SubstIndex(S, I);
- ExprResult Res = calculateConstraintSatisfaction(
- S, FE->getPattern(), Template, TemplateNameLoc, MLTAL, Satisfaction);
- if (Res.isInvalid())
- return ExprError();
- bool IsRHSSatisfied = Satisfaction.IsSatisfied;
- if (!Conjunction && IsRHSSatisfied) {
- auto EffectiveDetailEnd = Satisfaction.Details.begin();
- std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex);
- Satisfaction.Details.erase(EffectiveDetailEnd,
- Satisfaction.Details.end());
- }
- if (Out.isUnset())
- Out = Res;
- else if (!Res.isUnset()) {
- Out = BinaryOperator::Create(
- S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy,
- VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{});
- }
- if (Conjunction != IsRHSSatisfied)
- return Out;
+ TemplateArgumentListInfo SubstArgs;
+ Sema::ArgPackSubstIndexRAII SubstIndex(
+ S, Constraint.getPackSubstitutionIndex()
+ ? Constraint.getPackSubstitutionIndex()
+ : PackSubstitutionIndex);
+
+ if (S.SubstTemplateArgumentsInParameterMapping(
+ Constraint.getParameterMapping(), Constraint.getBeginLoc(), MLTAL,
+ SubstArgs, /*BuildPackExpansionTypes=*/true)) {
+ Satisfaction.IsSatisfied = false;
+ return std::nullopt;
}
- if (FE->isRightFold() && FE->getInit()) {
- ExprResult Res = calculateConstraintSatisfaction(
- S, FE->getInit(), Template, TemplateNameLoc, MLTAL, Satisfaction);
- if (Out.isInvalid())
- return ExprError();
-
- if (Out.isUnset())
- Out = Res;
- else if (!Res.isUnset()) {
- Out = BinaryOperator::Create(
- S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy,
- VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{});
+ Sema::CheckTemplateArgumentInfo CTAI;
+ auto *TD = const_cast<TemplateDecl *>(
+ cast<TemplateDecl>(Constraint.getConstraintDecl()));
+ if (S.CheckTemplateArgumentList(TD, Constraint.getUsedTemplateParamList(),
+ TD->getLocation(), SubstArgs,
+ /*DefaultArguments=*/{},
+ /*PartialTemplateArgs=*/false, CTAI))
+ return std::nullopt;
+ const NormalizedConstraint::OccurenceList &Used =
+ Constraint.mappingOccurenceList();
+ SubstitutedOuterMost =
+ llvm::to_vector_of<TemplateArgument>(MLTAL.getOutermost());
+ unsigned Offset = 0;
+ for (unsigned I = 0, MappedIndex = 0; I < Used.size(); I++) {
+ TemplateArgument Arg;
+ if (Used[I])
+ Arg = S.Context.getCanonicalTemplateArgument(
+ CTAI.SugaredConverted[MappedIndex++]);
+ if (I < SubstitutedOuterMost.size()) {
+ SubstitutedOuterMost[I] = Arg;
+ Offset = I + 1;
+ } else {
+ SubstitutedOuterMost.push_back(Arg);
+ Offset = SubstitutedOuterMost.size();
}
}
+  if (Offset < SubstitutedOuterMost.size())
+    SubstitutedOuterMost.erase(SubstitutedOuterMost.begin() + Offset,
+                               SubstitutedOuterMost.end());
- if (Out.isUnset()) {
- Satisfaction.IsSatisfied = Conjunction;
- Out = S.BuildEmptyCXXFoldExpr(FE->getBeginLoc(), FE->getOperator());
- }
- return Out;
+ MLTAL.replaceOutermostTemplateArguments(
+ const_cast<NamedDecl *>(Constraint.getConstraintDecl()),
+ SubstitutedOuterMost);
+ return std::move(MLTAL);
}
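A worked example of the parameter-mapping substitution (hypothetical declarations):

    template <class T, class U> concept C = true;
    template <class X> void f() requires C<X *, int>;
    // C's atomic constraint is reached through the mapping {T -> X*, U -> int}.
    // For f<float>, this routine substitutes {X -> float} into that mapping,
    // validates the result against C's template parameter list, and installs
    // {T -> float*, U -> int} as the new outermost level of the returned MLTAL.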
-static ExprResult calculateConstraintSatisfaction(
- Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template,
- SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
- ConstraintSatisfaction &Satisfaction) {
- ConstraintExpr = ConstraintExpr->IgnoreParenImpCasts();
-
- if (LogicalBinOp BO = ConstraintExpr)
- return calculateConstraintSatisfaction(
- S, BO.getLHS(), BO.getOp(), BO.getRHS(), Template, TemplateNameLoc,
- MLTAL, Satisfaction);
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+ const AtomicConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
- if (auto *C = dyn_cast<ExprWithCleanups>(ConstraintExpr)) {
- // These aren't evaluated, so we don't care about cleanups, so we can just
- // evaluate these as if the cleanups didn't exist.
- return calculateConstraintSatisfaction(
- S, C->getSubExpr(), Template, TemplateNameLoc, MLTAL, Satisfaction);
- }
-
- if (auto *FE = dyn_cast<CXXFoldExpr>(ConstraintExpr);
- FE && S.getLangOpts().CPlusPlus26 &&
- (FE->getOperator() == BinaryOperatorKind::BO_LAnd ||
- FE->getOperator() == BinaryOperatorKind::BO_LOr)) {
- return calculateConstraintSatisfaction(S, FE, Template, TemplateNameLoc,
- MLTAL, Satisfaction);
+ llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+ std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+ SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost);
+ if (!SubstitutedArgs) {
+ Satisfaction.IsSatisfied = false;
+ return ExprEmpty();
}
- // FIXME: We should not treat ConceptSpecializationExpr as atomic constraints.
-
- // An atomic constraint expression
+ Sema::ArgPackSubstIndexRAII SubstIndex(S, PackSubstitutionIndex);
ExprResult SubstitutedAtomicExpr = EvaluateAtomicConstraint(
- S, ConstraintExpr, Template, TemplateNameLoc, MLTAL, Satisfaction);
+ Constraint.getConstraintExpr(), *SubstitutedArgs);
if (SubstitutedAtomicExpr.isInvalid())
return ExprError();
- if (!SubstitutedAtomicExpr.isUsable())
+ if (SubstitutedAtomicExpr.isUnset())
// Evaluator has decided satisfaction without yielding an expression.
return ExprEmpty();
@@ -512,16 +640,16 @@ static ExprResult calculateConstraintSatisfaction(
Satisfaction.ContainsErrors = true;
PartialDiagnostic Msg = S.PDiag(diag::note_constraint_references_error);
- SmallString<128> DiagString;
- DiagString = ": ";
- Msg.EmitToString(S.getDiagnostics(), DiagString);
- unsigned MessageSize = DiagString.size();
- char *Mem = new (S.Context) char[MessageSize];
- memcpy(Mem, DiagString.c_str(), MessageSize);
Satisfaction.Details.emplace_back(
- new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{
+ new (S.Context) ConstraintSubstitutionDiagnostic{
SubstitutedAtomicExpr.get()->getBeginLoc(),
- StringRef(Mem, MessageSize)});
+ allocateStringFromConceptDiagnostic(S, Msg)});
+ return SubstitutedAtomicExpr;
+ }
+
+ if (SubstitutedAtomicExpr.get()->isValueDependent()) {
+ Satisfaction.IsSatisfied = true;
+ Satisfaction.ContainsErrors = false;
return SubstitutedAtomicExpr;
}
@@ -552,21 +680,384 @@ static ExprResult calculateConstraintSatisfaction(
return SubstitutedAtomicExpr;
}
-static ExprResult calculateConstraintSatisfaction(
- Sema &S, const NamedDecl *Template, SourceLocation TemplateNameLoc,
- const MultiLevelTemplateArgumentList &MLTAL, const Expr *ConstraintExpr,
- ConstraintSatisfaction &Satisfaction) {
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+ const AtomicConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ unsigned Size = Satisfaction.Details.size();
+ llvm::FoldingSetNodeID ID;
+ UnsignedOrNone OuterPackSubstIndex =
+ Constraint.getPackSubstitutionIndex()
+ ? Constraint.getPackSubstitutionIndex()
+ : PackSubstitutionIndex;
+
+ ID.AddPointer(Constraint.getConstraintExpr());
+ ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation());
+ HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex)
+ .VisitConstraint(Constraint);
+
+ if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+ Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+ auto &Cached = Iter->second.Satisfaction;
+ Satisfaction.ContainsErrors = Cached.ContainsErrors;
+ Satisfaction.IsSatisfied = Cached.IsSatisfied;
+ Satisfaction.Details.insert(Satisfaction.Details.begin() + Size,
+ Cached.Details.begin(), Cached.Details.end());
+ return Iter->second.SubstExpr;
+ }
+
+ ExprResult E = EvaluateSlow(Constraint, MLTAL);
+
+ UnsubstitutedConstraintSatisfactionCacheResult Cache;
+ Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+ Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+ std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+ std::back_inserter(Cache.Satisfaction.Details));
+ Cache.SubstExpr = E;
+ S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+
+ return E;
+}
+
+UnsignedOrNone
+ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
+ const FoldExpandedConstraint &FE,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ // We should ignore errors in the presence of packs of different size.
+ Sema::SFINAETrap Trap(S);
+
+ Expr *Pattern = const_cast<Expr *>(FE.getPattern());
+
+ SmallVector<UnexpandedParameterPack, 2> Unexpanded;
+ S.collectUnexpandedParameterPacks(Pattern, Unexpanded);
+ assert(!Unexpanded.empty() && "Pack expansion without parameter packs?");
+ bool Expand = true;
+ bool RetainExpansion = false;
+ UnsignedOrNone NumExpansions(std::nullopt);
+ if (S.CheckParameterPacksForExpansion(
+ Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
+ /*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion,
+ NumExpansions) ||
+ !Expand || RetainExpansion)
+ return std::nullopt;
+
+ if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
+ S.Diag(Pattern->getExprLoc(),
+ clang::diag::err_fold_expression_limit_exceeded)
+ << *NumExpansions << S.getLangOpts().BracketDepth
+ << Pattern->getSourceRange();
+ S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth);
+ return std::nullopt;
+ }
+ return NumExpansions;
+}
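For orientation, the quantity being computed (illustrative):

    template <class T> concept C = sizeof(T) <= 8;
    template <class... Ts>
      requires (C<Ts> && ...) // a fold expanded constraint
    void g(Ts...);
    // For g(1, 2.0) the pattern expands to C<int> && C<double>, so the size
    // is 2; an empty pack yields 0, which EvaluateSlow below treats as a
    // vacuously satisfied conjunction (or an unsatisfied disjunction).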
- return calculateConstraintSatisfaction(S, ConstraintExpr, Template,
- TemplateNameLoc, MLTAL, Satisfaction);
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+ const FoldExpandedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ bool Conjunction = Constraint.getFoldOperator() ==
+ FoldExpandedConstraint::FoldOperatorKind::And;
+ unsigned EffectiveDetailEndIndex = Satisfaction.Details.size();
+
+ llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+ // FIXME: Is PackSubstitutionIndex correct?
+ llvm::SaveAndRestore _(PackSubstitutionIndex, S.ArgPackSubstIndex);
+ std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+ SubstitutionInTemplateArguments(
+ static_cast<const NormalizedConstraintWithParamMapping &>(Constraint),
+ MLTAL, SubstitutedOuterMost);
+ if (!SubstitutedArgs) {
+ Satisfaction.IsSatisfied = false;
+ return ExprError();
+ }
+
+ ExprResult Out;
+ UnsignedOrNone NumExpansions =
+ EvaluateFoldExpandedConstraintSize(Constraint, *SubstitutedArgs);
+ if (!NumExpansions)
+ return ExprEmpty();
+
+ if (*NumExpansions == 0) {
+ Satisfaction.IsSatisfied = Conjunction;
+ return ExprEmpty();
+ }
+
+ for (unsigned I = 0; I < *NumExpansions; I++) {
+ Sema::ArgPackSubstIndexRAII SubstIndex(S, I);
+ Satisfaction.IsSatisfied = false;
+ Satisfaction.ContainsErrors = false;
+ ExprResult Expr =
+ ConstraintSatisfactionChecker(S, Template, TemplateNameLoc,
+ UnsignedOrNone(I), Satisfaction)
+ .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs);
+ if (Expr.isUsable()) {
+ if (Out.isUnset())
+ Out = Expr;
+ else
+ Out = BinaryOperator::Create(S.Context, Out.get(), Expr.get(),
+ Conjunction ? BinaryOperatorKind::BO_LAnd
+ : BinaryOperatorKind::BO_LOr,
+ S.Context.BoolTy, VK_PRValue, OK_Ordinary,
+ Constraint.getBeginLoc(),
+ FPOptionsOverride{});
+ } else {
+ assert(!Satisfaction.IsSatisfied);
+ }
+ if (!Conjunction && Satisfaction.IsSatisfied) {
+ Satisfaction.Details.erase(Satisfaction.Details.begin() +
+ EffectiveDetailEndIndex,
+ Satisfaction.Details.end());
+ break;
+ }
+ if (Satisfaction.IsSatisfied != Conjunction)
+ return Out;
+ }
+
+ return Out;
+}
+
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+ const FoldExpandedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ llvm::FoldingSetNodeID ID;
+ ID.AddPointer(Constraint.getPattern());
+ HashParameterMapping(S, MLTAL, ID, std::nullopt).VisitConstraint(Constraint);
+
+ if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+ Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+ auto &Cached = Iter->second.Satisfaction;
+ Satisfaction.ContainsErrors = Cached.ContainsErrors;
+ Satisfaction.IsSatisfied = Cached.IsSatisfied;
+ Satisfaction.Details.insert(Satisfaction.Details.end(),
+ Cached.Details.begin(), Cached.Details.end());
+ return Iter->second.SubstExpr;
+ }
+
+ unsigned Size = Satisfaction.Details.size();
+
+ ExprResult E = EvaluateSlow(Constraint, MLTAL);
+ UnsubstitutedConstraintSatisfactionCacheResult Cache;
+ Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+ Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+ std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+ std::back_inserter(Cache.Satisfaction.Details));
+ Cache.SubstExpr = E;
+ S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+ return E;
+}
+
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+ const ConceptIdConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL, unsigned Size) {
+ const ConceptReference *ConceptId = Constraint.getConceptId();
+
+ llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+ std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+ SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost);
+
+ if (!SubstitutedArgs) {
+ Satisfaction.IsSatisfied = false;
+ // FIXME: diagnostics?
+ return ExprError();
+ }
+
+ Sema::SFINAETrap Trap(S);
+ Sema::ArgPackSubstIndexRAII SubstIndex(
+ S, Constraint.getPackSubstitutionIndex()
+ ? Constraint.getPackSubstitutionIndex()
+ : PackSubstitutionIndex);
+
+ const ASTTemplateArgumentListInfo *Ori =
+ ConceptId->getTemplateArgsAsWritten();
+ TemplateDeductionInfo Info(TemplateNameLoc);
+ Sema::InstantiatingTemplate _(
+ S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{},
+ const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+
+ TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc);
+ if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) ||
+ Trap.hasErrorOccurred()) {
+ Satisfaction.IsSatisfied = false;
+ if (!Trap.hasErrorOccurred())
+ return ExprError();
+
+ PartialDiagnosticAt SubstDiag{SourceLocation(),
+ PartialDiagnostic::NullDiagnostic()};
+ Info.takeSFINAEDiagnostic(SubstDiag);
+ // FIXME: This is an unfortunate consequence of there
+ // being no serialization code for PartialDiagnostics and the fact
+ // that serializing them would likely take a lot more storage than
+ // just storing them as strings. We would still like, in the
+ // future, to serialize the proper PartialDiagnostic as serializing
+ // it as a string defeats the purpose of the diagnostic mechanism.
+ Satisfaction.Details.insert(
+ Satisfaction.Details.begin() + Size,
+ new (S.Context) ConstraintSubstitutionDiagnostic{
+ SubstDiag.first,
+ allocateStringFromConceptDiagnostic(S, SubstDiag.second)});
+ return ExprError();
+ }
+
+ CXXScopeSpec SS;
+ SS.Adopt(ConceptId->getNestedNameSpecifierLoc());
+
+ ExprResult SubstitutedConceptId = S.CheckConceptTemplateId(
+ SS, ConceptId->getTemplateKWLoc(), ConceptId->getConceptNameInfo(),
+ ConceptId->getFoundDecl(), ConceptId->getNamedConcept(), &OutArgs,
+ /*DoCheckConstraintSatisfaction=*/false);
+
+ if (SubstitutedConceptId.isInvalid() || Trap.hasErrorOccurred())
+ return ExprError();
+
+ if (Size != Satisfaction.Details.size()) {
+ Satisfaction.Details.insert(
+ Satisfaction.Details.begin() + Size,
+ UnsatisfiedConstraintRecord(
+ SubstitutedConceptId.getAs<ConceptSpecializationExpr>()
+ ->getConceptReference()));
+ }
+ return SubstitutedConceptId;
+}
+
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+ const ConceptIdConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ const ConceptReference *ConceptId = Constraint.getConceptId();
+
+ UnsignedOrNone OuterPackSubstIndex =
+ Constraint.getPackSubstitutionIndex()
+ ? Constraint.getPackSubstitutionIndex()
+ : PackSubstitutionIndex;
+
+ Sema::InstantiatingTemplate _(S, ConceptId->getBeginLoc(),
+ Sema::InstantiatingTemplate::ConstraintsCheck{},
+ ConceptId->getNamedConcept(),
+ MLTAL.getInnermost(),
+ Constraint.getSourceRange());
+
+ unsigned Size = Satisfaction.Details.size();
+
+ ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL);
+
+ if (!E.isUsable()) {
+ Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId);
+ return E;
+ }
+
+ // ConceptIdConstraint is only relevant for diagnostics,
+ // so if the normalized constraint is satisfied, we should not
+ // substitute into the constraint.
+ if (Satisfaction.IsSatisfied)
+ return E;
+
+ llvm::FoldingSetNodeID ID;
+ ID.AddPointer(Constraint.getConceptId());
+ ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation());
+ HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex)
+ .VisitConstraint(Constraint);
+
+ if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+ Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+ auto &Cached = Iter->second.Satisfaction;
+ Satisfaction.ContainsErrors = Cached.ContainsErrors;
+ Satisfaction.IsSatisfied = Cached.IsSatisfied;
+ Satisfaction.Details.insert(Satisfaction.Details.begin() + Size,
+ Cached.Details.begin(), Cached.Details.end());
+ return Iter->second.SubstExpr;
+ }
+
+ ExprResult CE = EvaluateSlow(Constraint, MLTAL, Size);
+ if (CE.isInvalid())
+ return E;
+ UnsubstitutedConstraintSatisfactionCacheResult Cache;
+ Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+ Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+ std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+ std::back_inserter(Cache.Satisfaction.Details));
+ Cache.SubstExpr = CE;
+ S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+ return CE;
+}
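A hedged sketch of what keeping the concept-id record buys in diagnostics:

    template <class T> concept Small = sizeof(T) <= 4;
    template <class T> concept Tiny = Small<T> && sizeof(T) == 1;
    static_assert(Tiny<double>); // unsatisfied
    // Because the substituted concept-id is recorded in the details, the
    // diagnostic can say that 'Small<double>' evaluated to false, not merely
    // that the atomic expression sizeof(T) <= 4 did.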
+
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+ const CompoundConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+
+ unsigned EffectiveDetailEndIndex = Satisfaction.Details.size();
+
+ bool Conjunction =
+ Constraint.getCompoundKind() == NormalizedConstraint::CCK_Conjunction;
+
+ ExprResult LHS = Evaluate(Constraint.getLHS(), MLTAL);
+
+ if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors))
+ return LHS;
+
+ if (!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied &&
+ !Satisfaction.ContainsErrors)
+ return LHS;
+
+ Satisfaction.ContainsErrors = false;
+ Satisfaction.IsSatisfied = false;
+
+ ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL);
+
+ if (RHS.isUsable() && Satisfaction.IsSatisfied &&
+ !Satisfaction.ContainsErrors)
+ Satisfaction.Details.erase(Satisfaction.Details.begin() +
+ EffectiveDetailEndIndex,
+ Satisfaction.Details.end());
+
+ if (!LHS.isUsable())
+ return RHS;
+
+ if (!RHS.isUsable())
+ return LHS;
+
+ return BinaryOperator::Create(S.Context, LHS.get(), RHS.get(),
+ Conjunction ? BinaryOperatorKind::BO_LAnd
+ : BinaryOperatorKind::BO_LOr,
+ S.Context.BoolTy, VK_PRValue, OK_Ordinary,
+ Constraint.getBeginLoc(), FPOptionsOverride{});
+}
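The short-circuiting implemented above follows [temp.constr.op] (illustrative):

    template <class T> concept A = true;
    template <class T> concept B = true;
    template <class T> void h() requires A<T> || B<T>;
    // If A<T> is satisfied, B<T> is never evaluated (the early return on the
    // LHS); if only B<T> ends up satisfied, the details recorded for the
    // failed A<T> are erased, since they are irrelevant to a satisfied
    // disjunction.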
+
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+ const NormalizedConstraint &Constraint,
+ const MultiLevelTemplateArgumentList &MLTAL) {
+ switch (Constraint.getKind()) {
+ case NormalizedConstraint::ConstraintKind::Atomic:
+ return Evaluate(static_cast<const AtomicConstraint &>(Constraint), MLTAL);
+
+ case NormalizedConstraint::ConstraintKind::FoldExpanded:
+ return Evaluate(static_cast<const FoldExpandedConstraint &>(Constraint),
+ MLTAL);
+
+ case NormalizedConstraint::ConstraintKind::ConceptId:
+ return Evaluate(static_cast<const ConceptIdConstraint &>(Constraint),
+ MLTAL);
+
+ case NormalizedConstraint::ConstraintKind::Compound:
+ return Evaluate(static_cast<const CompoundConstraint &>(Constraint), MLTAL);
+  }
+  llvm_unreachable("Unhandled ConstraintKind");
}
static bool CheckConstraintSatisfaction(
Sema &S, const NamedDecl *Template,
ArrayRef<AssociatedConstraint> AssociatedConstraints,
- llvm::SmallVectorImpl<Expr *> &Converted,
const MultiLevelTemplateArgumentList &TemplateArgsLists,
- SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) {
+ SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction,
+ Expr **ConvertedExpr, const ConceptReference *TopLevelConceptId = nullptr) {
+
+ if (ConvertedExpr)
+ *ConvertedExpr = nullptr;
+
if (AssociatedConstraints.empty()) {
Satisfaction.IsSatisfied = true;
return false;
@@ -578,57 +1069,60 @@ static bool CheckConstraintSatisfaction(
return false;
}
- ArrayRef<TemplateArgument> TemplateArgs =
- TemplateArgsLists.getNumSubstitutedLevels() > 0
- ? TemplateArgsLists.getOutermost()
- : ArrayRef<TemplateArgument>{};
- Sema::InstantiatingTemplate Inst(S, TemplateIDRange.getBegin(),
- Sema::InstantiatingTemplate::ConstraintsCheck{},
- const_cast<NamedDecl *>(Template), TemplateArgs, TemplateIDRange);
- if (Inst.isInvalid())
+ llvm::ArrayRef<TemplateArgument> Args;
+ if (TemplateArgsLists.getNumLevels() != 0)
+ Args = TemplateArgsLists.getInnermost();
+
+ std::optional<Sema::InstantiatingTemplate> SynthesisContext;
+ if (!TopLevelConceptId) {
+ SynthesisContext.emplace(S, TemplateIDRange.getBegin(),
+ Sema::InstantiatingTemplate::ConstraintsCheck{},
+ const_cast<NamedDecl *>(Template), Args,
+ TemplateIDRange);
+ }
+
+ const NormalizedConstraint *C =
+ S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints);
+ if (!C) {
+ Satisfaction.IsSatisfied = false;
return true;
+ }
- for (const AssociatedConstraint &AC : AssociatedConstraints) {
- if (AC.isNull())
- return true;
+ if (TopLevelConceptId)
+ C = ConceptIdConstraint::Create(S.getASTContext(), TopLevelConceptId,
+ const_cast<NormalizedConstraint *>(C),
+ Template, /*CSE=*/nullptr,
+ S.ArgPackSubstIndex);
- Sema::ArgPackSubstIndexRAII _(S, AC.ArgPackSubstIndex);
- ExprResult Res = calculateConstraintSatisfaction(
- S, Template, TemplateIDRange.getBegin(), TemplateArgsLists,
- AC.ConstraintExpr, Satisfaction);
- if (Res.isInvalid())
- return true;
+ ExprResult Res =
+ ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(),
+ S.ArgPackSubstIndex, Satisfaction)
+ .Evaluate(*C, TemplateArgsLists);
+
+ if (Res.isInvalid())
+ return true;
+
+ if (Res.isUsable() && ConvertedExpr)
+ *ConvertedExpr = Res.get();
- Converted.push_back(Res.get());
- if (!Satisfaction.IsSatisfied) {
- // Backfill the 'converted' list with nulls so we can keep the Converted
- // and unconverted lists in sync.
- Converted.append(AssociatedConstraints.size() - Converted.size(),
- nullptr);
- // [temp.constr.op] p2
- // [...] To determine if a conjunction is satisfied, the satisfaction
- // of the first operand is checked. If that is not satisfied, the
- // conjunction is not satisfied. [...]
- return false;
- }
- }
return false;
}
bool Sema::CheckConstraintSatisfaction(
- const NamedDecl *Template,
+ ConstrainedDeclOrNestedRequirement Entity,
ArrayRef<AssociatedConstraint> AssociatedConstraints,
- llvm::SmallVectorImpl<Expr *> &ConvertedConstraints,
const MultiLevelTemplateArgumentList &TemplateArgsLists,
- SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction) {
+ SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction,
+ const ConceptReference *TopLevelConceptId, Expr **ConvertedExpr) {
if (AssociatedConstraints.empty()) {
OutSatisfaction.IsSatisfied = true;
return false;
}
+ const auto *Template = Entity.dyn_cast<const NamedDecl *>();
if (!Template) {
return ::CheckConstraintSatisfaction(
- *this, nullptr, AssociatedConstraints, ConvertedConstraints,
- TemplateArgsLists, TemplateIDRange, OutSatisfaction);
+ *this, nullptr, AssociatedConstraints, TemplateArgsLists,
+ TemplateIDRange, OutSatisfaction, ConvertedExpr, TopLevelConceptId);
}
// Invalid templates could make their way here. Substituting them could result
// in dependent expressions.
@@ -643,10 +1137,15 @@ bool Sema::CheckConstraintSatisfaction(
// here.
llvm::SmallVector<TemplateArgument, 4> FlattenedArgs;
for (auto List : TemplateArgsLists)
- llvm::append_range(FlattenedArgs, List.Args);
+ for (const TemplateArgument &Arg : List.Args)
+ FlattenedArgs.emplace_back(Context.getCanonicalTemplateArgument(Arg));
+
+ const NamedDecl *Owner = Template;
+ if (TopLevelConceptId)
+ Owner = TopLevelConceptId->getNamedConcept();
llvm::FoldingSetNodeID ID;
- ConstraintSatisfaction::Profile(ID, Context, Template, FlattenedArgs);
+ ConstraintSatisfaction::Profile(ID, Context, Owner, FlattenedArgs);
void *InsertPos;
if (auto *Cached = SatisfactionCache.FindNodeOrInsertPos(ID, InsertPos)) {
OutSatisfaction = *Cached;
@@ -654,11 +1153,11 @@ bool Sema::CheckConstraintSatisfaction(
}
auto Satisfaction =
- std::make_unique<ConstraintSatisfaction>(Template, FlattenedArgs);
- if (::CheckConstraintSatisfaction(*this, Template, AssociatedConstraints,
- ConvertedConstraints, TemplateArgsLists,
- TemplateIDRange, *Satisfaction)) {
- OutSatisfaction = *Satisfaction;
+ std::make_unique<ConstraintSatisfaction>(Owner, FlattenedArgs);
+ if (::CheckConstraintSatisfaction(
+ *this, Template, AssociatedConstraints, TemplateArgsLists,
+ TemplateIDRange, *Satisfaction, ConvertedExpr, TopLevelConceptId)) {
+ OutSatisfaction = std::move(*Satisfaction);
return true;
}
@@ -688,14 +1187,18 @@ bool Sema::CheckConstraintSatisfaction(
const ConceptSpecializationExpr *ConstraintExpr,
ConstraintSatisfaction &Satisfaction) {
+ llvm::SmallVector<AssociatedConstraint, 1> Constraints;
+ Constraints.emplace_back(
+ ConstraintExpr->getNamedConcept()->getConstraintExpr());
+
MultiLevelTemplateArgumentList MLTAL(ConstraintExpr->getNamedConcept(),
ConstraintExpr->getTemplateArguments(),
true);
- return calculateConstraintSatisfaction(
- *this, ConstraintExpr, ConstraintExpr->getNamedConcept(),
- ConstraintExpr->getConceptNameLoc(), MLTAL, Satisfaction)
- .isInvalid();
+ return CheckConstraintSatisfaction(
+ ConstraintExpr->getNamedConcept(), Constraints, MLTAL,
+ ConstraintExpr->getSourceRange(), Satisfaction,
+ ConstraintExpr->getConceptReference());
}
bool Sema::SetupConstraintScope(
@@ -854,50 +1357,6 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
Satisfaction);
}
-
-// Figure out the to-translation-unit depth for this function declaration for
-// the purpose of seeing if they differ by constraints. This isn't the same as
-// getTemplateDepth, because it includes already instantiated parents.
-static unsigned
-CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND,
- bool SkipForSpecialization = false) {
- MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
- ND, ND->getLexicalDeclContext(), /*Final=*/false,
- /*Innermost=*/std::nullopt,
- /*RelativeToPrimary=*/true,
- /*Pattern=*/nullptr,
- /*ForConstraintInstantiation=*/true, SkipForSpecialization);
- return MLTAL.getNumLevels();
-}
-
-namespace {
- class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> {
- unsigned TemplateDepth = 0;
- public:
- using inherited = TreeTransform<AdjustConstraintDepth>;
- AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth)
- : inherited(SemaRef), TemplateDepth(TemplateDepth) {}
-
- using inherited::TransformTemplateTypeParmType;
- QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB,
- TemplateTypeParmTypeLoc TL, bool) {
- const TemplateTypeParmType *T = TL.getTypePtr();
-
- TemplateTypeParmDecl *NewTTPDecl = nullptr;
- if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl())
- NewTTPDecl = cast_or_null<TemplateTypeParmDecl>(
- TransformDecl(TL.getNameLoc(), OldTTPDecl));
-
- QualType Result = getSema().Context.getTemplateTypeParmType(
- T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(),
- NewTTPDecl);
- TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result);
- NewTL.setNameLoc(TL.getNameLoc());
- return Result;
- }
- };
-} // namespace
-
static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
Sema &S, const Sema::TemplateCompareNewDeclInfo &DeclInfo,
const Expr *ConstrExpr) {
@@ -1161,73 +1620,61 @@ bool Sema::CheckFunctionTemplateConstraints(
static void diagnoseUnsatisfiedRequirement(Sema &S,
concepts::ExprRequirement *Req,
bool First) {
- assert(!Req->isSatisfied()
- && "Diagnose() can only be used on an unsatisfied requirement");
+ assert(!Req->isSatisfied() &&
+ "Diagnose() can only be used on an unsatisfied requirement");
switch (Req->getSatisfactionStatus()) {
- case concepts::ExprRequirement::SS_Dependent:
- llvm_unreachable("Diagnosing a dependent requirement");
- break;
- case concepts::ExprRequirement::SS_ExprSubstitutionFailure: {
- auto *SubstDiag = Req->getExprSubstitutionDiagnostic();
- if (!SubstDiag->DiagMessage.empty())
- S.Diag(SubstDiag->DiagLoc,
- diag::note_expr_requirement_expr_substitution_error)
- << (int)First << SubstDiag->SubstitutedEntity
- << SubstDiag->DiagMessage;
- else
- S.Diag(SubstDiag->DiagLoc,
- diag::note_expr_requirement_expr_unknown_substitution_error)
- << (int)First << SubstDiag->SubstitutedEntity;
- break;
- }
- case concepts::ExprRequirement::SS_NoexceptNotMet:
- S.Diag(Req->getNoexceptLoc(),
- diag::note_expr_requirement_noexcept_not_met)
- << (int)First << Req->getExpr();
- break;
- case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: {
- auto *SubstDiag =
- Req->getReturnTypeRequirement().getSubstitutionDiagnostic();
- if (!SubstDiag->DiagMessage.empty())
- S.Diag(SubstDiag->DiagLoc,
- diag::note_expr_requirement_type_requirement_substitution_error)
- << (int)First << SubstDiag->SubstitutedEntity
- << SubstDiag->DiagMessage;
- else
- S.Diag(SubstDiag->DiagLoc,
- diag::note_expr_requirement_type_requirement_unknown_substitution_error)
- << (int)First << SubstDiag->SubstitutedEntity;
- break;
- }
- case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: {
- ConceptSpecializationExpr *ConstraintExpr =
- Req->getReturnTypeRequirementSubstitutedConstraintExpr();
- if (ConstraintExpr->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
- // A simple case - expr type is the type being constrained and the concept
- // was not provided arguments.
- Expr *e = Req->getExpr();
- S.Diag(e->getBeginLoc(),
- diag::note_expr_requirement_constraints_not_satisfied_simple)
- << (int)First << S.Context.getReferenceQualifiedType(e)
- << ConstraintExpr->getNamedConcept();
- } else {
- S.Diag(ConstraintExpr->getBeginLoc(),
- diag::note_expr_requirement_constraints_not_satisfied)
- << (int)First << ConstraintExpr;
- }
- S.DiagnoseUnsatisfiedConstraint(ConstraintExpr->getSatisfaction());
- break;
- }
- case concepts::ExprRequirement::SS_Satisfied:
- llvm_unreachable("We checked this above");
+ case concepts::ExprRequirement::SS_Dependent:
+ llvm_unreachable("Diagnosing a dependent requirement");
+ break;
+ case concepts::ExprRequirement::SS_ExprSubstitutionFailure: {
+ auto *SubstDiag = Req->getExprSubstitutionDiagnostic();
+ if (!SubstDiag->DiagMessage.empty())
+ S.Diag(SubstDiag->DiagLoc,
+ diag::note_expr_requirement_expr_substitution_error)
+ << (int)First << SubstDiag->SubstitutedEntity
+ << SubstDiag->DiagMessage;
+ else
+ S.Diag(SubstDiag->DiagLoc,
+ diag::note_expr_requirement_expr_unknown_substitution_error)
+ << (int)First << SubstDiag->SubstitutedEntity;
+ break;
+ }
+ case concepts::ExprRequirement::SS_NoexceptNotMet:
+ S.Diag(Req->getNoexceptLoc(), diag::note_expr_requirement_noexcept_not_met)
+ << (int)First << Req->getExpr();
+ break;
+ case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: {
+ auto *SubstDiag =
+ Req->getReturnTypeRequirement().getSubstitutionDiagnostic();
+ if (!SubstDiag->DiagMessage.empty())
+ S.Diag(SubstDiag->DiagLoc,
+ diag::note_expr_requirement_type_requirement_substitution_error)
+ << (int)First << SubstDiag->SubstitutedEntity
+ << SubstDiag->DiagMessage;
+ else
+ S.Diag(
+ SubstDiag->DiagLoc,
+ diag::
+ note_expr_requirement_type_requirement_unknown_substitution_error)
+ << (int)First << SubstDiag->SubstitutedEntity;
+ break;
+ }
+ case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: {
+ ConceptSpecializationExpr *ConstraintExpr =
+ Req->getReturnTypeRequirementSubstitutedConstraintExpr();
+ S.DiagnoseUnsatisfiedConstraint(ConstraintExpr);
+ break;
+ }
+ case concepts::ExprRequirement::SS_Satisfied:
+ llvm_unreachable("We checked this above");
}
}
static void diagnoseUnsatisfiedRequirement(Sema &S,
concepts::TypeRequirement *Req,
bool First) {
- assert(!Req->isSatisfied()
- && "Diagnose() can only be used on an unsatisfied requirement");
+ assert(!Req->isSatisfied() &&
+ "Diagnose() can only be used on an unsatisfied requirement");
switch (Req->getSatisfactionStatus()) {
case concepts::TypeRequirement::SS_Dependent:
llvm_unreachable("Diagnosing a dependent requirement");
@@ -1235,9 +1682,9 @@ static void diagnoseUnsatisfiedRequirement(Sema &S,
case concepts::TypeRequirement::SS_SubstitutionFailure: {
auto *SubstDiag = Req->getSubstitutionDiagnostic();
if (!SubstDiag->DiagMessage.empty())
- S.Diag(SubstDiag->DiagLoc,
- diag::note_type_requirement_substitution_error) << (int)First
- << SubstDiag->SubstitutedEntity << SubstDiag->DiagMessage;
+ S.Diag(SubstDiag->DiagLoc, diag::note_type_requirement_substitution_error)
+ << (int)First << SubstDiag->SubstitutedEntity
+ << SubstDiag->DiagMessage;
else
S.Diag(SubstDiag->DiagLoc,
diag::note_type_requirement_unknown_substitution_error)
@@ -1249,31 +1696,53 @@ static void diagnoseUnsatisfiedRequirement(Sema &S,
return;
}
}
-static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
- Expr *SubstExpr,
- bool First = true);
+
+static void diagnoseUnsatisfiedConceptIdExpr(Sema &S,
+ const ConceptReference *Concept,
+ SourceLocation Loc, bool First) {
+ if (Concept->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
+ S.Diag(
+ Loc,
+ diag::
+ note_single_arg_concept_specialization_constraint_evaluated_to_false)
+ << (int)First
+ << Concept->getTemplateArgsAsWritten()->arguments()[0].getArgument()
+ << Concept->getNamedConcept();
+ } else {
+ S.Diag(Loc, diag::note_concept_specialization_constraint_evaluated_to_false)
+ << (int)First << Concept;
+ }
+}
+
+static void diagnoseUnsatisfiedConstraintExpr(
+ Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc,
+ bool First, concepts::NestedRequirement *Req = nullptr);
+
+static void DiagnoseUnsatisfiedConstraint(
+ Sema &S, ArrayRef<UnsatisfiedConstraintRecord> Records, SourceLocation Loc,
+ bool First = true, concepts::NestedRequirement *Req = nullptr) {
+ for (auto &Record : Records) {
+ diagnoseUnsatisfiedConstraintExpr(S, Record, Loc, First, Req);
+ Loc = {};
+ First = isa<const ConceptReference *>(Record);
+ }
+}
static void diagnoseUnsatisfiedRequirement(Sema &S,
concepts::NestedRequirement *Req,
bool First) {
- using SubstitutionDiagnostic = std::pair<SourceLocation, StringRef>;
- for (auto &Record : Req->getConstraintSatisfaction()) {
- if (auto *SubstDiag = Record.dyn_cast<SubstitutionDiagnostic *>())
- S.Diag(SubstDiag->first, diag::note_nested_requirement_substitution_error)
- << (int)First << Req->getInvalidConstraintEntity()
- << SubstDiag->second;
- else
- diagnoseWellFormedUnsatisfiedConstraintExpr(S, Record.dyn_cast<Expr *>(),
- First);
- First = false;
- }
+ DiagnoseUnsatisfiedConstraint(S, Req->getConstraintSatisfaction().records(),
+ Req->hasInvalidConstraint()
+ ? SourceLocation()
+ : Req->getConstraintExpr()->getExprLoc(),
+ First, Req);
}
static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
- Expr *SubstExpr,
+ const Expr *SubstExpr,
bool First) {
SubstExpr = SubstExpr->IgnoreParenImpCasts();
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) {
+ if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) {
switch (BO->getOpcode()) {
// These two cases will in practice only be reached when using fold
// expressions with || and &&, since otherwise the || and && will have been
@@ -1319,7 +1788,7 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
BO->getRHS()->EvaluateAsInt(SimplifiedRHS, S.Context,
Expr::SE_NoSideEffects,
/*InConstantContext=*/true);
- if (!SimplifiedLHS.Diag && ! SimplifiedRHS.Diag) {
+ if (!SimplifiedLHS.Diag && !SimplifiedRHS.Diag) {
S.Diag(SubstExpr->getBeginLoc(),
diag::note_atomic_constraint_evaluated_to_false_elaborated)
<< (int)First << SubstExpr
@@ -1334,22 +1803,6 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
default:
break;
}
- } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) {
- if (CSE->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
- S.Diag(
- CSE->getSourceRange().getBegin(),
- diag::
- note_single_arg_concept_specialization_constraint_evaluated_to_false)
- << (int)First
- << CSE->getTemplateArgsAsWritten()->arguments()[0].getArgument()
- << CSE->getNamedConcept();
- } else {
- S.Diag(SubstExpr->getSourceRange().getBegin(),
- diag::note_concept_specialization_constraint_evaluated_to_false)
- << (int)First << CSE;
- }
- S.DiagnoseUnsatisfiedConstraint(CSE->getSatisfaction());
- return;
} else if (auto *RE = dyn_cast<RequiresExpr>(SubstExpr)) {
// FIXME: RequiresExpr should store dependent diagnostics.
for (concepts::Requirement *Req : RE->getRequirements())
@@ -1364,6 +1817,10 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
break;
}
return;
+ } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) {
+ // Drill down into concept ids treated as atomic constraints
+ S.DiagnoseUnsatisfiedConstraint(CSE, First);
+ return;
} else if (auto *TTE = dyn_cast<TypeTraitExpr>(SubstExpr);
TTE && TTE->getTrait() == clang::TypeTrait::BTT_IsDeducible) {
assert(TTE->getNumArgs() == 2);
@@ -1379,216 +1836,332 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
S.DiagnoseTypeTraitDetails(SubstExpr);
}
-template <typename SubstitutionDiagnostic>
static void diagnoseUnsatisfiedConstraintExpr(
- Sema &S, const llvm::PointerUnion<Expr *, SubstitutionDiagnostic *> &Record,
- bool First = true) {
- if (auto *Diag = Record.template dyn_cast<SubstitutionDiagnostic *>()) {
- S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed)
- << Diag->second;
+ Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc,
+ bool First, concepts::NestedRequirement *Req) {
+ if (auto *Diag =
+ Record
+ .template dyn_cast<const ConstraintSubstitutionDiagnostic *>()) {
+ if (Req)
+ S.Diag(Diag->first, diag::note_nested_requirement_substitution_error)
+ << (int)First << Req->getInvalidConstraintEntity() << Diag->second;
+ else
+ S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed)
+ << Diag->second;
return;
}
-
- diagnoseWellFormedUnsatisfiedConstraintExpr(S, cast<Expr *>(Record), First);
+ if (const auto *Concept = dyn_cast<const ConceptReference *>(Record)) {
+ if (Loc.isInvalid())
+ Loc = Concept->getBeginLoc();
+ diagnoseUnsatisfiedConceptIdExpr(S, Concept, Loc, First);
+ return;
+ }
+ diagnoseWellFormedUnsatisfiedConstraintExpr(
+ S, cast<const class Expr *>(Record), First);
}
-void
-Sema::DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction& Satisfaction,
- bool First) {
+void Sema::DiagnoseUnsatisfiedConstraint(
+ const ConstraintSatisfaction &Satisfaction, SourceLocation Loc,
+ bool First) {
+
assert(!Satisfaction.IsSatisfied &&
"Attempted to diagnose a satisfied constraint");
- for (auto &Record : Satisfaction.Details) {
- diagnoseUnsatisfiedConstraintExpr(*this, Record, First);
- First = false;
- }
+ ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.Details, Loc, First);
}
void Sema::DiagnoseUnsatisfiedConstraint(
- const ASTConstraintSatisfaction &Satisfaction,
- bool First) {
+ const ConceptSpecializationExpr *ConstraintExpr, bool First) {
+
+ const ASTConstraintSatisfaction &Satisfaction =
+ ConstraintExpr->getSatisfaction();
+
assert(!Satisfaction.IsSatisfied &&
"Attempted to diagnose a satisfied constraint");
- for (auto &Record : Satisfaction) {
- diagnoseUnsatisfiedConstraintExpr(*this, Record, First);
- First = false;
- }
+
+ ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.records(),
+ ConstraintExpr->getBeginLoc(), First);
}
-const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints(
- const NamedDecl *ConstrainedDecl,
- ArrayRef<AssociatedConstraint> AssociatedConstraints) {
- // In case the ConstrainedDecl comes from modules, it is necessary to use
- // the canonical decl to avoid different atomic constraints with the 'same'
- // declarations.
- ConstrainedDecl = cast<NamedDecl>(ConstrainedDecl->getCanonicalDecl());
+namespace {
- auto CacheEntry = NormalizationCache.find(ConstrainedDecl);
- if (CacheEntry == NormalizationCache.end()) {
- auto Normalized = NormalizedConstraint::fromAssociatedConstraints(
- *this, ConstrainedDecl, AssociatedConstraints);
- CacheEntry =
- NormalizationCache
- .try_emplace(ConstrainedDecl,
- Normalized
- ? new (Context) NormalizedConstraint(
- std::move(*Normalized))
- : nullptr)
- .first;
- }
- return CacheEntry->second;
-}
+class SubstituteParameterMappings {
+ Sema &SemaRef;
-const NormalizedConstraint *clang::getNormalizedAssociatedConstraints(
- Sema &S, const NamedDecl *ConstrainedDecl,
- ArrayRef<AssociatedConstraint> AssociatedConstraints) {
- return S.getNormalizedAssociatedConstraints(ConstrainedDecl,
- AssociatedConstraints);
-}
+ const MultiLevelTemplateArgumentList *MLTAL;
+ const ASTTemplateArgumentListInfo *ArgsAsWritten;
-static bool
-substituteParameterMappings(Sema &S, NormalizedConstraint &N,
- ConceptDecl *Concept,
- const MultiLevelTemplateArgumentList &MLTAL,
- const ASTTemplateArgumentListInfo *ArgsAsWritten) {
+ bool InFoldExpr;
- if (N.isCompound()) {
- if (substituteParameterMappings(S, N.getLHS(), Concept, MLTAL,
- ArgsAsWritten))
- return true;
- return substituteParameterMappings(S, N.getRHS(), Concept, MLTAL,
- ArgsAsWritten);
- }
+ SubstituteParameterMappings(Sema &SemaRef,
+ const MultiLevelTemplateArgumentList *MLTAL,
+ const ASTTemplateArgumentListInfo *ArgsAsWritten,
+ bool InFoldExpr)
+ : SemaRef(SemaRef), MLTAL(MLTAL), ArgsAsWritten(ArgsAsWritten),
+ InFoldExpr(InFoldExpr) {}
+
+ void buildParameterMapping(NormalizedConstraintWithParamMapping &N);
+
+ bool substitute(NormalizedConstraintWithParamMapping &N);
+
+ bool substitute(ConceptIdConstraint &CC);
+
+public:
+ SubstituteParameterMappings(Sema &SemaRef, bool InFoldExpr = false)
+ : SemaRef(SemaRef), MLTAL(nullptr), ArgsAsWritten(nullptr),
+ InFoldExpr(InFoldExpr) {}
+
+ bool substitute(NormalizedConstraint &N);
+};
- if (N.isFoldExpanded()) {
- Sema::ArgPackSubstIndexRAII _(S, std::nullopt);
- return substituteParameterMappings(
- S, N.getFoldExpandedConstraint()->Constraint, Concept, MLTAL,
- ArgsAsWritten);
+void SubstituteParameterMappings::buildParameterMapping(
+ NormalizedConstraintWithParamMapping &N) {
+ TemplateParameterList *TemplateParams =
+ cast<TemplateDecl>(N.getConstraintDecl())->getTemplateParameters();
+
+ llvm::SmallBitVector OccurringIndices(TemplateParams->size());
+ llvm::SmallBitVector OccurringIndicesForSubsumption(TemplateParams->size());
+
+ if (N.getKind() == NormalizedConstraint::ConstraintKind::Atomic) {
+ SemaRef.MarkUsedTemplateParameters(
+ static_cast<AtomicConstraint &>(N).getConstraintExpr(),
+ /*OnlyDeduced=*/false,
+ /*Depth=*/0, OccurringIndices);
+
+ SemaRef.MarkUsedTemplateParametersForSubsumptionParameterMapping(
+ static_cast<AtomicConstraint &>(N).getConstraintExpr(),
+ /*Depth=*/0, OccurringIndicesForSubsumption);
+
+ } else if (N.getKind() ==
+ NormalizedConstraint::ConstraintKind::FoldExpanded) {
+ SemaRef.MarkUsedTemplateParameters(
+ static_cast<FoldExpandedConstraint &>(N).getPattern(),
+ /*OnlyDeduced=*/false,
+ /*Depth=*/0, OccurringIndices);
+ } else if (N.getKind() == NormalizedConstraint::ConstraintKind::ConceptId) {
+ auto *Args = static_cast<ConceptIdConstraint &>(N)
+ .getConceptId()
+ ->getTemplateArgsAsWritten();
+ if (Args)
+ SemaRef.MarkUsedTemplateParameters(Args->arguments(),
+ /*Depth=*/0, OccurringIndices);
}
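+ // Build an identity mapping covering only the template parameters that
+ // actually occur in the constraint.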
+ TemplateArgumentLoc *TempArgs =
+ new (SemaRef.Context) TemplateArgumentLoc[OccurringIndices.count()];
+ llvm::SmallVector<NamedDecl *> UsedParams;
+ for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) {
+ SourceLocation Loc = ArgsAsWritten->NumTemplateArgs > I
+ ? ArgsAsWritten->arguments()[I].getLocation()
+ : SourceLocation();
+ // FIXME: Investigate why we couldn't always preserve the SourceLoc. We
+ // can't assert Loc.isValid() for now.
+ if (OccurringIndices[I]) {
+ NamedDecl *Param = TemplateParams->begin()[I];
+ new (&(TempArgs)[J]) TemplateArgumentLoc(
+ SemaRef.getIdentityTemplateArgumentLoc(Param, Loc));
+ UsedParams.push_back(Param);
+ J++;
+ }
+ }
+ auto *UsedList = TemplateParameterList::Create(
+ SemaRef.Context, TemplateParams->getTemplateLoc(),
+ TemplateParams->getLAngleLoc(), UsedParams,
+ /*RAngleLoc=*/SourceLocation(),
+ /*RequiresClause=*/nullptr);
+ unsigned Size = OccurringIndices.count();
+ N.updateParameterMapping(
+ std::move(OccurringIndices), std::move(OccurringIndicesForSubsumption),
+ MutableArrayRef<TemplateArgumentLoc>{TempArgs, Size}, UsedList);
+}
- TemplateParameterList *TemplateParams = Concept->getTemplateParameters();
+bool SubstituteParameterMappings::substitute(
+ NormalizedConstraintWithParamMapping &N) {
+ if (!N.hasParameterMapping())
+ buildParameterMapping(N);
- AtomicConstraint &Atomic = *N.getAtomicConstraint();
- TemplateArgumentListInfo SubstArgs;
- if (!Atomic.ParameterMapping) {
- llvm::SmallBitVector OccurringIndices(TemplateParams->size());
- S.MarkUsedTemplateParameters(Atomic.ConstraintExpr, /*OnlyDeduced=*/false,
- /*Depth=*/0, OccurringIndices);
- TemplateArgumentLoc *TempArgs =
- new (S.Context) TemplateArgumentLoc[OccurringIndices.count()];
- for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I)
- if (OccurringIndices[I])
- new (&(TempArgs)[J++])
- TemplateArgumentLoc(S.getIdentityTemplateArgumentLoc(
- TemplateParams->begin()[I],
- // Here we assume we do not support things like
- // template<typename A, typename B>
- // concept C = ...;
- //
- // template<typename... Ts> requires C<Ts...>
- // struct S { };
- // The above currently yields a diagnostic.
- // We still might have default arguments for concept parameters.
- ArgsAsWritten->NumTemplateArgs > I
- ? ArgsAsWritten->arguments()[I].getLocation()
- : SourceLocation()));
- Atomic.ParameterMapping.emplace(TempArgs, OccurringIndices.count());
- }
- SourceLocation InstLocBegin =
- ArgsAsWritten->arguments().empty()
- ? ArgsAsWritten->getLAngleLoc()
- : ArgsAsWritten->arguments().front().getSourceRange().getBegin();
- SourceLocation InstLocEnd =
- ArgsAsWritten->arguments().empty()
- ? ArgsAsWritten->getRAngleLoc()
- : ArgsAsWritten->arguments().front().getSourceRange().getEnd();
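+ // Derive the instantiation range from the first written argument, or from
+ // the angle brackets when no arguments were written.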
+ SourceLocation InstLocBegin, InstLocEnd;
+ llvm::ArrayRef Arguments = ArgsAsWritten->arguments();
+ if (Arguments.empty()) {
+ InstLocBegin = ArgsAsWritten->getLAngleLoc();
+ InstLocEnd = ArgsAsWritten->getRAngleLoc();
+ } else {
+ auto SR = Arguments[0].getSourceRange();
+ InstLocBegin = SR.getBegin();
+ InstLocEnd = SR.getEnd();
+ }
Sema::InstantiatingTemplate Inst(
- S, InstLocBegin,
+ SemaRef, InstLocBegin,
Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
- const_cast<NamedDecl *>(Atomic.ConstraintDecl),
+ const_cast<NamedDecl *>(N.getConstraintDecl()),
{InstLocBegin, InstLocEnd});
if (Inst.isInvalid())
return true;
- if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs))
+
+ // TransformTemplateArguments is unable to preserve the source location of a
+ // pack. The SourceLocation is necessary for the instantiation location.
+ // FIXME: The BaseLoc will be used as the location of the pack expansion,
+ // which is wrong.
+ TemplateArgumentListInfo SubstArgs;
+ if (SemaRef.SubstTemplateArgumentsInParameterMapping(
+ N.getParameterMapping(), N.getBeginLoc(), *MLTAL, SubstArgs,
+ /*BuildPackExpansionTypes=*/!InFoldExpr))
+ return true;
+ Sema::CheckTemplateArgumentInfo CTAI;
+ auto *TD =
+ const_cast<TemplateDecl *>(cast<TemplateDecl>(N.getConstraintDecl()));
+ if (SemaRef.CheckTemplateArgumentList(TD, N.getUsedTemplateParamList(),
+ TD->getLocation(), SubstArgs,
+ /*DefaultArguments=*/{},
+ /*PartialTemplateArgs=*/false, CTAI))
return true;
TemplateArgumentLoc *TempArgs =
- new (S.Context) TemplateArgumentLoc[SubstArgs.size()];
- std::copy(SubstArgs.arguments().begin(), SubstArgs.arguments().end(),
- TempArgs);
- Atomic.ParameterMapping.emplace(TempArgs, SubstArgs.size());
+ new (SemaRef.Context) TemplateArgumentLoc[CTAI.SugaredConverted.size()];
+
+ for (unsigned I = 0; I < CTAI.SugaredConverted.size(); ++I) {
+ SourceLocation Loc;
+ // If this is an empty pack, we have no corresponding SubstArgs.
+ if (I < SubstArgs.size())
+ Loc = SubstArgs.arguments()[I].getLocation();
+
+ TempArgs[I] = SemaRef.getTrivialTemplateArgumentLoc(
+ CTAI.SugaredConverted[I], QualType(), Loc);
+ }
+
+ MutableArrayRef<TemplateArgumentLoc> Mapping(TempArgs,
+ CTAI.SugaredConverted.size());
+ N.updateParameterMapping(N.mappingOccurenceList(),
+ N.mappingOccurenceListForSubsumption(), Mapping,
+ N.getUsedTemplateParamList());
return false;
}
-static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N,
- const ConceptSpecializationExpr *CSE) {
- MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
- CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(),
- /*Final=*/false, CSE->getTemplateArguments(),
- /*RelativeToPrimary=*/true,
- /*Pattern=*/nullptr,
- /*ForConstraintInstantiation=*/true);
+bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
+ assert(CC.getConstraintDecl() && MLTAL && ArgsAsWritten);
- return substituteParameterMappings(S, N, CSE->getNamedConcept(), MLTAL,
- CSE->getTemplateArgsAsWritten());
-}
+ if (substitute(static_cast<NormalizedConstraintWithParamMapping &>(CC)))
+ return true;
-NormalizedConstraint::NormalizedConstraint(ASTContext &C,
- NormalizedConstraint LHS,
- NormalizedConstraint RHS,
- CompoundConstraintKind Kind)
- : Constraint{CompoundConstraint{
- new(C) NormalizedConstraintPair{std::move(LHS), std::move(RHS)},
- Kind}} {}
-
-NormalizedConstraint::NormalizedConstraint(ASTContext &C,
- const NormalizedConstraint &Other) {
- if (Other.isAtomic()) {
- Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint());
- } else if (Other.isFoldExpanded()) {
- Constraint = new (C) FoldExpandedConstraint(
- Other.getFoldExpandedConstraint()->Kind,
- NormalizedConstraint(C, Other.getFoldExpandedConstraint()->Constraint),
- Other.getFoldExpandedConstraint()->Pattern);
+ auto *CSE = CC.getConceptSpecializationExpr();
+ assert(CSE);
+ assert(!CC.getBeginLoc().isInvalid());
+
+ SourceLocation InstLocBegin, InstLocEnd;
+ if (llvm::ArrayRef Arguments = ArgsAsWritten->arguments();
+ Arguments.empty()) {
+ InstLocBegin = ArgsAsWritten->getLAngleLoc();
+ InstLocEnd = ArgsAsWritten->getRAngleLoc();
} else {
- Constraint = CompoundConstraint(
- new (C)
- NormalizedConstraintPair{NormalizedConstraint(C, Other.getLHS()),
- NormalizedConstraint(C, Other.getRHS())},
- Other.getCompoundKind());
+ auto SR = Arguments[0].getSourceRange();
+ InstLocBegin = SR.getBegin();
+ InstLocEnd = SR.getEnd();
}
-}
+ // This is useful for name lookup across modules; see Sema::getLookupModules.
+ Sema::InstantiatingTemplate Inst(
+ SemaRef, InstLocBegin,
+ Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
+ const_cast<NamedDecl *>(CC.getConstraintDecl()),
+ {InstLocBegin, InstLocEnd});
+ if (Inst.isInvalid())
+ return true;
-NormalizedConstraint &NormalizedConstraint::getLHS() const {
- assert(isCompound() && "getLHS called on a non-compound constraint.");
- return cast<CompoundConstraint>(Constraint).getPointer()->LHS;
+ TemplateArgumentListInfo Out;
+ // TransformTemplateArguments is unable to preserve the source location of a
+ // pack. The SourceLocation is necessary for the instantiation location.
+ // FIXME: The BaseLoc will be used as the location of the pack expansion,
+ // which is wrong.
+ const ASTTemplateArgumentListInfo *ArgsAsWritten =
+ CSE->getTemplateArgsAsWritten();
+ if (SemaRef.SubstTemplateArgumentsInParameterMapping(
+ ArgsAsWritten->arguments(), CC.getBeginLoc(), *MLTAL, Out,
+ /*BuildPackExpansionTypes=*/!InFoldExpr))
+ return true;
+ Sema::CheckTemplateArgumentInfo CTAI;
+ if (SemaRef.CheckTemplateArgumentList(CSE->getNamedConcept(),
+ CSE->getConceptNameInfo().getLoc(), Out,
+ /*DefaultArgs=*/{},
+ /*PartialTemplateArgs=*/false, CTAI,
+ /*UpdateArgsWithConversions=*/false))
+ return true;
+ auto TemplateArgs = *MLTAL;
+ TemplateArgs.replaceOutermostTemplateArguments(
+ TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted);
+ return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten,
+ InFoldExpr)
+ .substitute(CC.getNormalizedConstraint());
}
-NormalizedConstraint &NormalizedConstraint::getRHS() const {
- assert(isCompound() && "getRHS called on a non-compound constraint.");
- return cast<CompoundConstraint>(Constraint).getPointer()->RHS;
+bool SubstituteParameterMappings::substitute(NormalizedConstraint &N) {
+ switch (N.getKind()) {
+ case NormalizedConstraint::ConstraintKind::Atomic: {
+ if (!MLTAL) {
+ assert(!ArgsAsWritten);
+ return false;
+ }
+ return substitute(static_cast<NormalizedConstraintWithParamMapping &>(N));
+ }
+ case NormalizedConstraint::ConstraintKind::FoldExpanded: {
+ auto &FE = static_cast<FoldExpandedConstraint &>(N);
+ if (!MLTAL) {
+ llvm::SaveAndRestore _1(InFoldExpr, true);
+ assert(!ArgsAsWritten);
+ return substitute(FE.getNormalizedPattern());
+ }
+ Sema::ArgPackSubstIndexRAII _(SemaRef, std::nullopt);
+ substitute(static_cast<NormalizedConstraintWithParamMapping &>(FE));
+ return SubstituteParameterMappings(SemaRef, /*InFoldExpr=*/true)
+ .substitute(FE.getNormalizedPattern());
+ }
+ case NormalizedConstraint::ConstraintKind::ConceptId: {
+ auto &CC = static_cast<ConceptIdConstraint &>(N);
+ if (MLTAL) {
+ assert(ArgsAsWritten);
+ return substitute(CC);
+ }
+ assert(!ArgsAsWritten);
+ const ConceptSpecializationExpr *CSE = CC.getConceptSpecializationExpr();
+ ConceptDecl *Concept = CSE->getNamedConcept();
+ MultiLevelTemplateArgumentList MLTAL = SemaRef.getTemplateInstantiationArgs(
+ Concept, Concept->getLexicalDeclContext(),
+ /*Final=*/true, CSE->getTemplateArguments(),
+ /*RelativeToPrimary=*/true,
+ /*Pattern=*/nullptr,
+ /*ForConstraintInstantiation=*/true);
+
+ return SubstituteParameterMappings(
+ SemaRef, &MLTAL, CSE->getTemplateArgsAsWritten(), InFoldExpr)
+ .substitute(CC.getNormalizedConstraint());
+ }
+ case NormalizedConstraint::ConstraintKind::Compound: {
+ auto &Compound = static_cast<CompoundConstraint &>(N);
+ if (substitute(Compound.getLHS()))
+ return true;
+ return substitute(Compound.getRHS());
+ }
+ }
}
-std::optional<NormalizedConstraint>
-NormalizedConstraint::fromAssociatedConstraints(
+} // namespace
+
+NormalizedConstraint *NormalizedConstraint::fromAssociatedConstraints(
Sema &S, const NamedDecl *D, ArrayRef<AssociatedConstraint> ACs) {
assert(ACs.size() != 0);
- auto Conjunction = fromConstraintExpr(S, D, ACs[0].ConstraintExpr);
+ auto *Conjunction =
+ fromConstraintExpr(S, D, ACs[0].ConstraintExpr, ACs[0].ArgPackSubstIndex);
if (!Conjunction)
- return std::nullopt;
+ return nullptr;
for (unsigned I = 1; I < ACs.size(); ++I) {
- auto Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr);
+ auto *Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr,
+ ACs[I].ArgPackSubstIndex);
if (!Next)
- return std::nullopt;
- *Conjunction = NormalizedConstraint(S.Context, std::move(*Conjunction),
- std::move(*Next), CCK_Conjunction);
+ return nullptr;
+ Conjunction = CompoundConstraint::CreateConjunction(S.getASTContext(),
+ Conjunction, Next);
}
return Conjunction;
}
-std::optional<NormalizedConstraint>
-NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
- const Expr *E) {
+NormalizedConstraint *NormalizedConstraint::fromConstraintExpr(
+ Sema &S, const NamedDecl *D, const Expr *E, UnsignedOrNone SubstIndex) {
assert(E != nullptr);
// C++ [temp.constr.normal]p1.1
@@ -1597,23 +2170,29 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
// [...]
E = E->IgnoreParenImpCasts();
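+ // Guard against recursive constraints: normalizing a concept-id can
+ // otherwise recurse without bound.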
+ llvm::FoldingSetNodeID ID;
+ if (D && DiagRecursiveConstraintEval(S, ID, D, E)) {
+ return nullptr;
+ }
+ SatisfactionStackRAII StackRAII(S, D, ID);
+
// C++2a [temp.param]p4:
// [...] If T is not a pack, then E is E', otherwise E is (E' && ...).
// Fold expression is considered atomic constraints per current wording.
// See http://cplusplus.github.io/concepts-ts/ts-active.html#28
if (LogicalBinOp BO = E) {
- auto LHS = fromConstraintExpr(S, D, BO.getLHS());
+ auto *LHS = fromConstraintExpr(S, D, BO.getLHS(), SubstIndex);
if (!LHS)
- return std::nullopt;
- auto RHS = fromConstraintExpr(S, D, BO.getRHS());
+ return nullptr;
+ auto *RHS = fromConstraintExpr(S, D, BO.getRHS(), SubstIndex);
if (!RHS)
- return std::nullopt;
+ return nullptr;
- return NormalizedConstraint(S.Context, std::move(*LHS), std::move(*RHS),
- BO.isAnd() ? CCK_Conjunction : CCK_Disjunction);
+ return CompoundConstraint::Create(
+ S.Context, LHS, BO.isAnd() ? CCK_Conjunction : CCK_Disjunction, RHS);
} else if (auto *CSE = dyn_cast<const ConceptSpecializationExpr>(E)) {
- const NormalizedConstraint *SubNF;
+ NormalizedConstraint *SubNF;
{
Sema::InstantiatingTemplate Inst(
S, CSE->getExprLoc(),
@@ -1621,7 +2200,7 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
// FIXME: improve const-correctness of InstantiatingTemplate
const_cast<NamedDecl *>(D), CSE->getSourceRange());
if (Inst.isInvalid())
- return std::nullopt;
+ return nullptr;
// C++ [temp.constr.normal]p1.1
// [...]
// The normal form of an id-expression of the form C<A1, A2, ..., AN>,
@@ -1631,20 +2210,21 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
// constraint. If any such substitution results in an invalid type or
// expression, the program is ill-formed; no diagnostic is required.
// [...]
- ConceptDecl *CD = CSE->getNamedConcept();
- SubNF = S.getNormalizedAssociatedConstraints(
- CD, AssociatedConstraint(CD->getConstraintExpr()));
+
+ // Use canonical declarations to merge ConceptDecls across
+ // different modules.
+ ConceptDecl *CD = CSE->getNamedConcept()->getCanonicalDecl();
+ SubNF = NormalizedConstraint::fromAssociatedConstraints(
+ S, CD, AssociatedConstraint(CD->getConstraintExpr(), SubstIndex));
+
if (!SubNF)
- return std::nullopt;
+ return nullptr;
}
- std::optional<NormalizedConstraint> New;
- New.emplace(S.Context, *SubNF);
-
- if (substituteParameterMappings(S, *New, CSE))
- return std::nullopt;
+ return ConceptIdConstraint::Create(S.getASTContext(),
+ CSE->getConceptReference(), SubNF, D,
+ CSE, SubstIndex);
- return New;
} else if (auto *FE = dyn_cast<const CXXFoldExpr>(E);
FE && S.getLangOpts().CPlusPlus26 &&
(FE->getOperator() == BinaryOperatorKind::BO_LAnd ||
@@ -1658,31 +2238,61 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
: FoldExpandedConstraint::FoldOperatorKind::Or;
if (FE->getInit()) {
- auto LHS = fromConstraintExpr(S, D, FE->getLHS());
- auto RHS = fromConstraintExpr(S, D, FE->getRHS());
+ auto *LHS = fromConstraintExpr(S, D, FE->getLHS(), SubstIndex);
+ auto *RHS = fromConstraintExpr(S, D, FE->getRHS(), SubstIndex);
if (!LHS || !RHS)
- return std::nullopt;
+ return nullptr;
if (FE->isRightFold())
- RHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
- Kind, std::move(*RHS), FE->getPattern()}};
+ LHS = FoldExpandedConstraint::Create(S.getASTContext(),
+ FE->getPattern(), D, Kind, LHS);
else
- LHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
- Kind, std::move(*LHS), FE->getPattern()}};
-
- return NormalizedConstraint(
- S.Context, std::move(*LHS), std::move(*RHS),
- FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction
- : CCK_Disjunction);
+ RHS = FoldExpandedConstraint::Create(S.getASTContext(),
+ FE->getPattern(), D, Kind, RHS);
+
+ return CompoundConstraint::Create(
+ S.getASTContext(), LHS,
+ (FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction
+ : CCK_Disjunction),
+ RHS);
}
- auto Sub = fromConstraintExpr(S, D, FE->getPattern());
+ auto *Sub = fromConstraintExpr(S, D, FE->getPattern(), SubstIndex);
if (!Sub)
- return std::nullopt;
- return NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
- Kind, std::move(*Sub), FE->getPattern()}};
+ return nullptr;
+ return FoldExpandedConstraint::Create(S.getASTContext(), FE->getPattern(),
+ D, Kind, Sub);
}
+ return AtomicConstraint::Create(S.getASTContext(), E, D, SubstIndex);
+}
- return NormalizedConstraint{new (S.Context) AtomicConstraint(E, D)};
+const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints(
+ ConstrainedDeclOrNestedRequirement ConstrainedDeclOrNestedReq,
+ ArrayRef<AssociatedConstraint> AssociatedConstraints) {
+ if (!ConstrainedDeclOrNestedReq) {
+ auto *Normalized = NormalizedConstraint::fromAssociatedConstraints(
+ *this, nullptr, AssociatedConstraints);
+ if (!Normalized ||
+ SubstituteParameterMappings(*this).substitute(*Normalized))
+ return nullptr;
+
+ return Normalized;
+ }
+
+ // FIXME: ConstrainedDeclOrNestedReq is never a NestedRequirement!
+ const NamedDecl *ND =
+ ConstrainedDeclOrNestedReq.dyn_cast<const NamedDecl *>();
+ auto CacheEntry = NormalizationCache.find(ConstrainedDeclOrNestedReq);
+ if (CacheEntry == NormalizationCache.end()) {
+ auto *Normalized = NormalizedConstraint::fromAssociatedConstraints(
+ *this, ND, AssociatedConstraints);
+ CacheEntry =
+ NormalizationCache.try_emplace(ConstrainedDeclOrNestedReq, Normalized)
+ .first;
+ if (!Normalized ||
+ SubstituteParameterMappings(*this).substitute(*Normalized))
+ return nullptr;
+ }
+ return CacheEntry->second;
}
bool FoldExpandedConstraint::AreCompatibleForSubsumption(
@@ -1693,8 +2303,10 @@ bool FoldExpandedConstraint::AreCompatibleForSubsumption(
// if their respective constraints both contain an equivalent unexpanded pack.
llvm::SmallVector<UnexpandedParameterPack> APacks, BPacks;
- Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.Pattern), APacks);
- Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.Pattern), BPacks);
+ Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.getPattern()),
+ APacks);
+ Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.getPattern()),
+ BPacks);
for (const UnexpandedParameterPack &APack : APacks) {
auto ADI = getDepthAndIndex(APack);
@@ -1788,7 +2400,7 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
const AtomicConstraint &B) {
if (!A.hasMatchingParameterMapping(Context, B))
return false;
- const Expr *EA = A.ConstraintExpr, *EB = B.ConstraintExpr;
+ const Expr *EA = A.getConstraintExpr(), *EB = B.getConstraintExpr();
if (EA == EB)
return true;
@@ -1841,24 +2453,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
return true;
}
-NormalizedConstraint::CompoundConstraintKind
-NormalizedConstraint::getCompoundKind() const {
- assert(isCompound() && "getCompoundKind on a non-compound constraint..");
- return cast<CompoundConstraint>(Constraint).getInt();
-}
-
-AtomicConstraint *NormalizedConstraint::getAtomicConstraint() const {
- assert(isAtomic() && "getAtomicConstraint called on non-atomic constraint.");
- return cast<AtomicConstraint *>(Constraint);
-}
-
-FoldExpandedConstraint *
-NormalizedConstraint::getFoldExpandedConstraint() const {
- assert(isFoldExpanded() &&
- "getFoldExpandedConstraint called on non-fold-expanded constraint.");
- return cast<FoldExpandedConstraint *>(Constraint);
-}
-
//
//
// ------------------------ Subsumption -----------------------------------
@@ -1874,8 +2468,8 @@ uint16_t SubsumptionChecker::getNewLiteralId() {
return NextID++;
}
-auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
- auto &Elems = AtomicMap[Ori->ConstraintExpr];
+auto SubsumptionChecker::find(const AtomicConstraint *Ori) -> Literal {
+ auto &Elems = AtomicMap[Ori->getConstraintExpr()];
// C++ [temp.constr.order] p2
// - an atomic constraint A subsumes another atomic constraint B
// if and only if the A and B are identical [...]
@@ -1891,13 +2485,16 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
// subsumes another, their literal will be the same
llvm::FoldingSetNodeID ID;
- const auto &Mapping = Ori->ParameterMapping;
- ID.AddBoolean(Mapping.has_value());
- if (Mapping) {
- for (const TemplateArgumentLoc &TAL : *Mapping) {
- SemaRef.getASTContext()
- .getCanonicalTemplateArgument(TAL.getArgument())
- .Profile(ID, SemaRef.getASTContext());
+ ID.AddBoolean(Ori->hasParameterMapping());
+ if (Ori->hasParameterMapping()) {
+ const auto &Mapping = Ori->getParameterMapping();
+ const NormalizedConstraint::OccurenceList &Indexes =
+ Ori->mappingOccurenceListForSubsumption();
+ for (auto [Idx, TAL] : llvm::enumerate(Mapping)) {
+ if (Indexes[Idx])
+ SemaRef.getASTContext()
+ .getCanonicalTemplateArgument(TAL.getArgument())
+ .Profile(ID, SemaRef.getASTContext());
}
}
auto It = Elems.find(ID);
@@ -1912,11 +2509,11 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
return It->getSecond().ID;
}
-auto SubsumptionChecker::find(FoldExpandedConstraint *Ori) -> Literal {
- auto &Elems = FoldMap[Ori->Pattern];
+auto SubsumptionChecker::find(const FoldExpandedConstraint *Ori) -> Literal {
+ auto &Elems = FoldMap[Ori->getPattern()];
FoldExpendedConstraintKey K;
- K.Kind = Ori->Kind;
+ K.Kind = Ori->getFoldOperator();
auto It = llvm::find_if(Elems, [&K](const FoldExpendedConstraintKey &Other) {
return K.Kind == Other.Kind;
@@ -1960,38 +2557,47 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) {
AddUniqueClauseToFormula(Res, std::move(C));
};
- if (NC.isAtomic())
- return {{find(NC.getAtomicConstraint())}};
+ switch (NC.getKind()) {
- if (NC.isFoldExpanded())
- return {{find(NC.getFoldExpandedConstraint())}};
+ case NormalizedConstraint::ConstraintKind::Atomic:
+ return {{find(&static_cast<const AtomicConstraint &>(NC))}};
- FormulaType Left, Right;
- SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] {
- Left = Normalize<FormulaType>(NC.getLHS());
- Right = Normalize<FormulaType>(NC.getRHS());
- });
+ case NormalizedConstraint::ConstraintKind::FoldExpanded:
+ return {{find(&static_cast<const FoldExpandedConstraint &>(NC))}};
- if (NC.getCompoundKind() == FormulaType::Kind) {
- auto SizeLeft = Left.size();
- Res = std::move(Left);
- Res.reserve(SizeLeft + Right.size());
- std::for_each(std::make_move_iterator(Right.begin()),
- std::make_move_iterator(Right.end()), Add);
- return Res;
- }
+ case NormalizedConstraint::ConstraintKind::ConceptId:
+ return Normalize<FormulaType>(
+ static_cast<const ConceptIdConstraint &>(NC).getNormalizedConstraint());
+
+ case NormalizedConstraint::ConstraintKind::Compound: {
+ const auto &Compound = static_cast<const CompoundConstraint &>(NC);
+ FormulaType Left, Right;
+ SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] {
+ Left = Normalize<FormulaType>(Compound.getLHS());
+ Right = Normalize<FormulaType>(Compound.getRHS());
+ });
+
+ if (Compound.getCompoundKind() == FormulaType::Kind) {
+ auto SizeLeft = Left.size();
+ Res = std::move(Left);
+ Res.reserve(SizeLeft + Right.size());
+ std::for_each(std::make_move_iterator(Right.begin()),
+ std::make_move_iterator(Right.end()), Add);
+ return Res;
+ }
- Res.reserve(Left.size() * Right.size());
- for (const auto &LTransform : Left) {
- for (const auto &RTransform : Right) {
- Clause Combined;
- Combined.reserve(LTransform.size() + RTransform.size());
- llvm::append_range(Combined, LTransform);
- llvm::append_range(Combined, RTransform);
- Add(std::move(Combined));
+ Res.reserve(Left.size() * Right.size());
+ for (const auto &LTransform : Left) {
+ for (const auto &RTransform : Right) {
+ Clause Combined;
+ Combined.reserve(LTransform.size() + RTransform.size());
+ llvm::copy(LTransform, std::back_inserter(Combined));
+ llvm::copy(RTransform, std::back_inserter(Combined));
+ Add(std::move(Combined));
+ }
}
+ return Res;
+ }
}
- return Res;
}
void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) {
@@ -2006,12 +2612,12 @@ std::optional<bool> SubsumptionChecker::Subsumes(
const NamedDecl *DP, ArrayRef<AssociatedConstraint> P, const NamedDecl *DQ,
ArrayRef<AssociatedConstraint> Q) {
const NormalizedConstraint *PNormalized =
- getNormalizedAssociatedConstraints(SemaRef, DP, P);
+ SemaRef.getNormalizedAssociatedConstraints(DP, P);
if (!PNormalized)
return std::nullopt;
const NormalizedConstraint *QNormalized =
- getNormalizedAssociatedConstraints(SemaRef, DQ, Q);
+ SemaRef.getNormalizedAssociatedConstraints(DQ, Q);
if (!QNormalized)
return std::nullopt;
@@ -2061,9 +2667,9 @@ bool SubsumptionChecker::Subsumes(const FoldExpandedConstraint *A,
// constraint B if they are compatible for subsumption, have the same
// fold-operator, and the constraint of A subsumes that of B.
bool DoesSubsume =
- A->Kind == B->Kind &&
+ A->getFoldOperator() == B->getFoldOperator() &&
FoldExpandedConstraint::AreCompatibleForSubsumption(*A, *B) &&
- Subsumes(&A->Constraint, &B->Constraint);
+ Subsumes(&A->getNormalizedPattern(), &B->getNormalizedPattern());
It = FoldSubsumptionCache.try_emplace(std::move(Key), DoesSubsume).first;
}
return It->second;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 16d42d2..d27f767 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17876,13 +17876,15 @@ Decl *Sema::BuildStaticAssertDeclaration(SourceLocation StaticAssertLoc,
findFailedBooleanCondition(Converted.get());
if (const auto *ConceptIDExpr =
dyn_cast_or_null<ConceptSpecializationExpr>(InnerCond)) {
- // Drill down into concept specialization expressions to see why they
- // weren't satisfied.
- Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed)
- << !HasMessage << Msg.str() << AssertExpr->getSourceRange();
- ConstraintSatisfaction Satisfaction;
- if (!CheckConstraintSatisfaction(ConceptIDExpr, Satisfaction))
- DiagnoseUnsatisfiedConstraint(Satisfaction);
+ const ASTConstraintSatisfaction &Satisfaction =
+ ConceptIDExpr->getSatisfaction();
+ if (!Satisfaction.ContainsErrors || Satisfaction.NumRecords) {
+ Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed)
+ << !HasMessage << Msg.str() << AssertExpr->getSourceRange();
+ // Drill down into concept specialization expressions to see why they
+ // weren't satisfied.
+ DiagnoseUnsatisfiedConstraint(ConceptIDExpr);
+ }
} else if (InnerCond && !isa<CXXBoolLiteralExpr>(InnerCond) &&
!isa<IntegerLiteral>(InnerCond)) {
Diag(InnerCond->getBeginLoc(),
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 576eb32..0fe242dce 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -7935,21 +7935,27 @@ Sema::BuildExprRequirement(
// be satisfied.
TemplateParameterList *TPL =
ReturnTypeRequirement.getTypeConstraintTemplateParameterList();
- QualType MatchedType =
- Context.getReferenceQualifiedType(E).getCanonicalType();
+ QualType MatchedType = Context.getReferenceQualifiedType(E);
llvm::SmallVector<TemplateArgument, 1> Args;
Args.push_back(TemplateArgument(MatchedType));
auto *Param = cast<TemplateTypeParmDecl>(TPL->getParam(0));
- MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/false);
+ MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/true);
MLTAL.addOuterRetainedLevels(TPL->getDepth());
const TypeConstraint *TC = Param->getTypeConstraint();
assert(TC && "Type Constraint cannot be null here");
auto *IDC = TC->getImmediatelyDeclaredConstraint();
assert(IDC && "ImmediatelyDeclaredConstraint can't be null here.");
ExprResult Constraint = SubstExpr(IDC, MLTAL);
- if (Constraint.isInvalid()) {
+ bool HasError = Constraint.isInvalid();
+ if (!HasError) {
+ SubstitutedConstraintExpr =
+ cast<ConceptSpecializationExpr>(Constraint.get());
+ if (SubstitutedConstraintExpr->getSatisfaction().ContainsErrors)
+ HasError = true;
+ }
+ if (HasError) {
return new (Context) concepts::ExprRequirement(
createSubstDiagAt(IDC->getExprLoc(),
[&](llvm::raw_ostream &OS) {
@@ -7958,8 +7964,6 @@ Sema::BuildExprRequirement(
}),
IsSimple, NoexceptLoc, ReturnTypeRequirement);
}
- SubstitutedConstraintExpr =
- cast<ConceptSpecializationExpr>(Constraint.get());
if (!SubstitutedConstraintExpr->isSatisfied())
Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
}
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index c971293..0d0d2c0 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -8219,8 +8219,8 @@ ExprResult InitializationSequence::Perform(Sema &S,
// InitializeTemporary entity for our target type.
QualType Ty = Step->Type;
bool IsTemporary = !S.Context.hasSameType(Entity.getType(), Ty);
- InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(Ty);
- InitializedEntity InitEntity = IsTemporary ? TempEntity : Entity;
+ InitializedEntity InitEntity =
+ IsTemporary ? InitializedEntity::InitializeTemporary(Ty) : Entity;
InitListChecker PerformInitList(S, InitEntity,
InitList, Ty, /*VerifyOnly=*/false,
/*TreatUnavailableAsInvalid=*/false);
@@ -8242,7 +8242,6 @@ ExprResult InitializationSequence::Perform(Sema &S,
InitListExpr *StructuredInitList =
PerformInitList.getFullyStructuredList();
- CurInit.get();
CurInit = shouldBindAsTemporary(InitEntity)
? S.MaybeBindToTemporary(StructuredInitList)
: StructuredInitList;
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index ea5c4265..b870114 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -804,7 +804,7 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
case TemplateDeductionResult::ConstraintsNotSatisfied: {
CNSInfo *Saved = new (Context) CNSInfo;
Saved->TemplateArgs = Info.takeSugared();
- Saved->Satisfaction = Info.AssociatedConstraintsSatisfaction;
+ Saved->Satisfaction = std::move(Info.AssociatedConstraintsSatisfaction);
Result.Data = Saved;
break;
}
@@ -852,6 +852,7 @@ void DeductionFailureInfo::Destroy() {
case TemplateDeductionResult::ConstraintsNotSatisfied:
// FIXME: Destroy the template argument list?
+ static_cast<CNSInfo *>(Data)->Satisfaction.~ConstraintSatisfaction();
Data = nullptr;
if (PartialDiagnosticAt *Diag = getSFINAEDiagnostic()) {
Diag->~PartialDiagnosticAt();
@@ -12739,7 +12740,8 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
<< (unsigned)FnKindPair.first << (unsigned)ocs_non_template
<< FnDesc /* Ignored */;
ConstraintSatisfaction Satisfaction;
- if (S.CheckFunctionConstraints(Fn, Satisfaction))
+ if (S.CheckFunctionConstraints(Fn, Satisfaction, SourceLocation(),
+ /*ForOverloadResolution=*/true))
break;
S.DiagnoseUnsatisfiedConstraint(Satisfaction);
}
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 2bf1511..dcf2876 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#include "TreeTransform.h"
+#include "clang/AST/ASTConcept.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
@@ -1222,8 +1223,9 @@ static ExprResult formImmediatelyDeclaredConstraint(
if (auto *CD = dyn_cast<ConceptDecl>(NamedConcept)) {
ImmediatelyDeclaredConstraint = S.CheckConceptTemplateId(
SS, /*TemplateKWLoc=*/SourceLocation(), NameInfo,
- /*FoundDecl=*/FoundDecl ? FoundDecl : NamedConcept, CD,
- &ConstraintArgs);
+ /*FoundDecl=*/FoundDecl ? FoundDecl : CD, CD, &ConstraintArgs,
+ /*DoCheckConstraintSatisfaction=*/
+ !S.inParameterMappingSubstitution());
}
// We have a template template parameter
else {
@@ -4850,13 +4852,11 @@ void Sema::diagnoseMissingTemplateArguments(const CXXScopeSpec &SS,
diagnoseMissingTemplateArguments(Name, Loc);
}
-ExprResult
-Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
- SourceLocation TemplateKWLoc,
- const DeclarationNameInfo &ConceptNameInfo,
- NamedDecl *FoundDecl,
- ConceptDecl *NamedConcept,
- const TemplateArgumentListInfo *TemplateArgs) {
+ExprResult Sema::CheckConceptTemplateId(
+ const CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
+ const DeclarationNameInfo &ConceptNameInfo, NamedDecl *FoundDecl,
+ TemplateDecl *NamedConcept, const TemplateArgumentListInfo *TemplateArgs,
+ bool DoCheckConstraintSatisfaction) {
assert(NamedConcept && "A concept template id without a template?");
if (NamedConcept->isInvalidDecl())
@@ -4873,33 +4873,48 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
DiagnoseUseOfDecl(NamedConcept, ConceptNameInfo.getLoc());
+ // There's a bug with CTAI.CanonicalConverted.
+ // If the template argument contains a DependentDecltypeType that includes a
+ // TypeAliasType, and the same written type occurred previously in the
+ // source, then the DependentDecltypeType would be canonicalized to that
+ // previous type, which would mess up the substitution.
+ // FIXME: Reland https://github.com/llvm/llvm-project/pull/101782 properly!
auto *CSD = ImplicitConceptSpecializationDecl::Create(
Context, NamedConcept->getDeclContext(), NamedConcept->getLocation(),
- CTAI.CanonicalConverted);
+ CTAI.SugaredConverted);
ConstraintSatisfaction Satisfaction;
bool AreArgsDependent =
TemplateSpecializationType::anyDependentTemplateArguments(
- *TemplateArgs, CTAI.CanonicalConverted);
- MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.CanonicalConverted,
+ *TemplateArgs, CTAI.SugaredConverted);
+ MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.SugaredConverted,
/*Final=*/false);
- LocalInstantiationScope Scope(*this);
-
- EnterExpressionEvaluationContext EECtx{
- *this, ExpressionEvaluationContext::Unevaluated, CSD};
-
- if (!AreArgsDependent &&
- CheckConstraintSatisfaction(
- NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()),
- MLTAL,
- SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(),
- TemplateArgs->getRAngleLoc()),
- Satisfaction))
- return ExprError();
auto *CL = ConceptReference::Create(
Context,
SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc{},
TemplateKWLoc, ConceptNameInfo, FoundDecl, NamedConcept,
ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs));
+
+ bool Error = false;
+ if (const auto *Concept = dyn_cast<ConceptDecl>(NamedConcept);
+ Concept && Concept->getConstraintExpr() && !AreArgsDependent &&
+ DoCheckConstraintSatisfaction) {
+
+ LocalInstantiationScope Scope(*this);
+
+ EnterExpressionEvaluationContext EECtx{
+ *this, ExpressionEvaluationContext::Unevaluated, CSD};
+
+ Error = CheckConstraintSatisfaction(
+ NamedConcept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL,
+ SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(),
+ TemplateArgs->getRAngleLoc()),
+ Satisfaction, CL);
+ Satisfaction.ContainsErrors = Error;
+ }
+
+ if (Error)
+ return ExprError();
+
return ConceptSpecializationExpr::Create(
Context, CL, CSD, AreArgsDependent ? nullptr : &Satisfaction);
}
@@ -5217,10 +5232,11 @@ bool Sema::CheckTemplateTypeArgument(
}
default: {
// We allow instantiating a template with template argument packs when
- // building deduction guides.
+ // building deduction guides or mapping constraint template parameters.
if (Arg.getKind() == TemplateArgument::Pack &&
- CodeSynthesisContexts.back().Kind ==
- Sema::CodeSynthesisContext::BuildingDeductionGuides) {
+ (CodeSynthesisContexts.back().Kind ==
+ Sema::CodeSynthesisContext::BuildingDeductionGuides ||
+ inParameterMappingSubstitution())) {
SugaredConverted.push_back(Arg);
CanonicalConverted.push_back(Arg);
return false;
@@ -5813,6 +5829,20 @@ bool Sema::CheckTemplateArgumentList(
TemplateArgumentListInfo &TemplateArgs, const DefaultArguments &DefaultArgs,
bool PartialTemplateArgs, CheckTemplateArgumentInfo &CTAI,
bool UpdateArgsWithConversions, bool *ConstraintsNotSatisfied) {
+ return CheckTemplateArgumentList(
+ Template, GetTemplateParameterList(Template), TemplateLoc, TemplateArgs,
+ DefaultArgs, PartialTemplateArgs, CTAI, UpdateArgsWithConversions,
+ ConstraintsNotSatisfied);
+}
+
+/// Check that the given template argument list is well-formed
+/// for specializing the given template.
+bool Sema::CheckTemplateArgumentList(
+ TemplateDecl *Template, TemplateParameterList *Params,
+ SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs,
+ const DefaultArguments &DefaultArgs, bool PartialTemplateArgs,
+ CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions,
+ bool *ConstraintsNotSatisfied) {
if (ConstraintsNotSatisfied)
*ConstraintsNotSatisfied = false;
@@ -5822,8 +5852,6 @@ bool Sema::CheckTemplateArgumentList(
// template.
TemplateArgumentListInfo NewArgs = TemplateArgs;
- TemplateParameterList *Params = GetTemplateParameterList(Template);
-
SourceLocation RAngleLoc = NewArgs.getRAngleLoc();
// C++23 [temp.arg.general]p1:
@@ -6163,11 +6191,12 @@ bool Sema::CheckTemplateArgumentList(
CXXThisScopeRAII Scope(*this, RD, ThisQuals, RD != nullptr);
MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs(
- Template, NewContext, /*Final=*/false, CTAI.CanonicalConverted,
+ Template, NewContext, /*Final=*/true, CTAI.SugaredConverted,
/*RelativeToPrimary=*/true,
/*Pattern=*/nullptr,
/*ForConceptInstantiation=*/true);
- if (EnsureTemplateArgumentListConstraints(
+ if (!isa<ConceptDecl>(Template) &&
+ EnsureTemplateArgumentListConstraints(
Template, MLTAL,
SourceRange(TemplateLoc, TemplateArgs.getRAngleLoc()))) {
if (ConstraintsNotSatisfied)
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index f6ee745..6bba505 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3206,7 +3206,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template,
// If we don't need to replace the deduced template arguments,
// we can add them immediately as the inner-most argument list.
if (!DeducedArgsNeedReplacement)
- Innermost = CanonicalDeducedArgs;
+ Innermost = SugaredDeducedArgs;
MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
Template, Template->getDeclContext(), /*Final=*/false, Innermost,
@@ -3218,7 +3218,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template,
// not class-scope explicit specialization, so replace with Deduced Args
// instead of adding to inner-most.
if (!Innermost)
- MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs);
+ MLTAL.replaceInnermostTemplateArguments(Template, SugaredDeducedArgs);
if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL,
Info.getLocation(),
@@ -3995,11 +3995,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
if (CheckFunctionTemplateConstraints(
Info.getLocation(),
FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(),
- CTAI.CanonicalConverted, Info.AssociatedConstraintsSatisfaction))
+ CTAI.SugaredConverted, Info.AssociatedConstraintsSatisfaction))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) {
- Info.reset(Info.takeSugared(), TemplateArgumentList::CreateCopy(
- Context, CTAI.CanonicalConverted));
+ Info.reset(
+ TemplateArgumentList::CreateCopy(Context, CTAI.SugaredConverted),
+ Info.takeCanonical());
return TemplateDeductionResult::ConstraintsNotSatisfied;
}
}
@@ -5167,8 +5168,8 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type,
/*DefaultArgs=*/{},
/*PartialTemplateArgs=*/false, CTAI))
return true;
- MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.CanonicalConverted,
- /*Final=*/false);
+ MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.SugaredConverted,
+ /*Final=*/true);
// Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so
// that the template arguments of the constraint can be preserved. For
// example:
@@ -5182,7 +5183,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type,
S, Sema::ExpressionEvaluationContext::Unevaluated,
ImplicitConceptSpecializationDecl::Create(
S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(),
- CTAI.CanonicalConverted));
+ CTAI.SugaredConverted));
if (S.CheckConstraintSatisfaction(
Concept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL,
TypeLoc.getLocalSourceRange(), Satisfaction))
@@ -6676,10 +6677,11 @@ namespace {
struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor {
llvm::SmallBitVector &Used;
unsigned Depth;
+ bool VisitDeclRefTypes = true;
- MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used,
- unsigned Depth)
- : Used(Used), Depth(Depth) { }
+ MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used, unsigned Depth,
+ bool VisitDeclRefTypes = true)
+ : Used(Used), Depth(Depth), VisitDeclRefTypes(VisitDeclRefTypes) {}
bool VisitTemplateTypeParmType(TemplateTypeParmType *T) override {
if (T->getDepth() == Depth)
@@ -6700,6 +6702,8 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor {
if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(E->getDecl()))
if (NTTP->getDepth() == Depth)
Used[NTTP->getIndex()] = true;
+ if (VisitDeclRefTypes)
+ DynamicRecursiveASTVisitor::TraverseType(E->getType());
return true;
}
@@ -7043,10 +7047,13 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T,
break;
case Type::UnaryTransform:
- if (!OnlyDeduced)
- MarkUsedTemplateParameters(Ctx,
- cast<UnaryTransformType>(T)->getUnderlyingType(),
- OnlyDeduced, Depth, Used);
+ if (!OnlyDeduced) {
+ auto *UTT = cast<UnaryTransformType>(T);
+ auto Next = UTT->getUnderlyingType();
+ if (Next.isNull())
+ Next = UTT->getBaseType();
+ MarkUsedTemplateParameters(Ctx, Next, OnlyDeduced, Depth, Used);
+ }
break;
case Type::PackExpansion:
@@ -7146,6 +7153,12 @@ Sema::MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced,
::MarkUsedTemplateParameters(Context, E, OnlyDeduced, Depth, Used);
}
+void Sema::MarkUsedTemplateParametersForSubsumptionParameterMapping(
+ const Expr *E, unsigned Depth, llvm::SmallBitVector &Used) {
+ MarkUsedTemplateParameterVisitor(Used, Depth, /*VisitDeclRefTypes=*/false)
+ .TraverseStmt(const_cast<Expr *>(E));
+}
+
void
Sema::MarkUsedTemplateParameters(const TemplateArgumentList &TemplateArgs,
bool OnlyDeduced, unsigned Depth,
@@ -7171,6 +7184,14 @@ void Sema::MarkUsedTemplateParameters(ArrayRef<TemplateArgument> TemplateArgs,
/*OnlyDeduced=*/false, Depth, Used);
}
+void Sema::MarkUsedTemplateParameters(
+ ArrayRef<TemplateArgumentLoc> TemplateArgs, unsigned Depth,
+ llvm::SmallBitVector &Used) {
+ for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
+ ::MarkUsedTemplateParameters(Context, TemplateArgs[I].getArgument(),
+ /*OnlyDeduced=*/false, Depth, Used);
+}
+
void Sema::MarkDeducedTemplateParameters(
ASTContext &Ctx, const FunctionTemplateDecl *FunctionTemplate,
llvm::SmallBitVector &Deduced) {
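
[Editor's note] The SemaTemplateDeduction.cpp hunks above switch constraint checking from canonical to sugared (as-written) template arguments, which is what lets the test updates further down print 'T2' rather than 'std_example::T2'. A rough sketch of the effect, using a hypothetical namespace:

namespace demo {                       // hypothetical, for illustration only
struct T2 {};

template <class T>
concept HasInner = requires { typename T::inner; };

template <HasInner T> struct Check {};

// Check<T2> fails its constraint; with sugared arguments the note reads
// "because 'T2' does not satisfy 'HasInner'" instead of spelling the
// canonical 'demo::T2'.
// using Bad = Check<T2>;              // uncomment to see the diagnostic
}
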
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index fe673ea..9a61888 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -1171,17 +1171,46 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
const auto &D = DeduceResults[Index];
+ auto *TP = F->getTemplateParameters()->getParam(Index);
if (IsNonDeducedArgument(D)) {
// 2): Non-deduced template parameters would be substituted later.
continue;
}
TemplateArgumentLoc Input =
SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{});
- TemplateArgumentLoc Output;
- if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) {
- assert(TemplateArgsForBuildingFPrime[Index].isNull() &&
- "InstantiatedArgs must be null before setting");
- TemplateArgsForBuildingFPrime[Index] = Output.getArgument();
+ TemplateArgumentListInfo Output;
+ if (SemaRef.SubstTemplateArguments(Input, Args, Output))
+ return nullptr;
+ assert(TemplateArgsForBuildingFPrime[Index].isNull() &&
+ "InstantiatedArgs must be null before setting");
+ // CheckTemplateArgument is necessary for NTTP initializations.
+ // FIXME: We may want to call CheckTemplateArguments instead, but we cannot
+ // match packs as usual, since packs can appear in the middle of the
+ // parameter list of a synthesized CTAD guide. See also the FIXME in
+ // test/SemaCXX/cxx20-ctad-type-alias.cpp:test25.
+ Sema::CheckTemplateArgumentInfo CTAI;
+ if (Input.getArgument().getKind() == TemplateArgument::Pack) {
+ for (auto TA : Output.arguments()) {
+ if (SemaRef.CheckTemplateArgument(
+ TP, TA, F, F->getLocation(), F->getLocation(),
+ /*ArgumentPackIndex=*/-1, CTAI,
+ Sema::CheckTemplateArgumentKind::CTAK_Specified))
+ return nullptr;
+ }
+ // The non-deduced template arguments will later be substituted with these
+ // transformed (by now unpacked) arguments; that substitution expects a
+ // pack for each corresponding parameter pack.
+ TemplateArgsForBuildingFPrime[Index] =
+ TemplateArgument::CreatePackCopy(Context, CTAI.SugaredConverted);
+ } else {
+ assert(Output.arguments().size() == 1);
+ TemplateArgumentLoc Transformed = Output.arguments()[0];
+ if (SemaRef.CheckTemplateArgument(
+ TP, Transformed, F, F->getLocation(), F->getLocation(),
+ /*ArgumentPackIndex=*/-1, CTAI,
+ Sema::CheckTemplateArgumentKind::CTAK_Specified))
+ return nullptr;
+ TemplateArgsForBuildingFPrime[Index] = CTAI.SugaredConverted[0];
}
}
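
[Editor's note] The CheckTemplateArgument calls added above are what allow non-type template arguments to be initialized, not merely substituted, while building a deduction guide for an alias template; the GH131408 test later in this patch exercises exactly this. A reduced sketch of the pattern it enables:

struct Node {};

template <class T, Node N>        // class-type NTTP: needs initialization
struct A {                        // via CheckTemplateArgument in the guide
  A(T) {}
};

template <class T>
using AA = A<T, Node{}>;          // alias CTAD must check/convert 'Node{}'

AA a{0};                          // deduces A<int, Node{}>
static_assert(__is_same(decltype(a), A<int, Node{}>));
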
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index f1c9c5c..1f762ca 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -628,9 +628,14 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Inst.InstantiationRange = InstantiationRange;
Inst.InConstraintSubstitution =
Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
- if (!SemaRef.CodeSynthesisContexts.empty())
+ Inst.InParameterMappingSubstitution =
+ Inst.Kind == CodeSynthesisContext::ParameterMappingSubstitution;
+ if (!SemaRef.CodeSynthesisContexts.empty()) {
Inst.InConstraintSubstitution |=
SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution;
+ Inst.InParameterMappingSubstitution |=
+ SemaRef.CodeSynthesisContexts.back().InParameterMappingSubstitution;
+ }
Invalid = SemaRef.pushCodeSynthesisContext(Inst);
if (!Invalid) {
@@ -1375,6 +1380,7 @@ std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
// Template Instantiation for Types
//===----------------------------------------------------------------------===/
namespace {
+
class TemplateInstantiator : public TreeTransform<TemplateInstantiator> {
const MultiLevelTemplateArgumentList &TemplateArgs;
SourceLocation Loc;
@@ -1387,7 +1393,11 @@ namespace {
// Whether an incomplete substitution should be treated as an error.
bool BailOutOnIncomplete;
- private:
+ // Whether to rebuild pack expansion types; we don't do that when
+ // rebuilding the parameter mapping of a fold expression appearing
+ // in a constraint expression.
+ bool BuildPackExpansionTypes = true;
+
// CWG2770: Function parameters should be instantiated when they are
// needed by a satisfaction check of an atomic constraint or
// (recursively) by another function parameter.
@@ -1410,6 +1420,17 @@ namespace {
return EvaluateConstraints;
}
+ inline static struct ForParameterMappingSubstitution_t {
+ } ForParameterMappingSubstitution;
+
+ TemplateInstantiator(ForParameterMappingSubstitution_t, Sema &SemaRef,
+ SourceLocation Loc,
+ const MultiLevelTemplateArgumentList &TemplateArgs,
+ bool BuildPackExpansionTypes)
+ : inherited(SemaRef), TemplateArgs(TemplateArgs), Loc(Loc),
+ BailOutOnIncomplete(false),
+ BuildPackExpansionTypes(BuildPackExpansionTypes) {}
+
/// Determine whether the given type \p T has already been
/// transformed.
///
@@ -1444,7 +1465,8 @@ namespace {
bool &ShouldExpand, bool &RetainExpansion,
UnsignedOrNone &NumExpansions) {
if (SemaRef.CurrentInstantiationScope &&
- SemaRef.inConstraintSubstitution()) {
+ (SemaRef.inConstraintSubstitution() ||
+ SemaRef.inParameterMappingSubstitution())) {
for (UnexpandedParameterPack ParmPack : Unexpanded) {
NamedDecl *VD = ParmPack.first.dyn_cast<NamedDecl *>();
if (auto *PVD = dyn_cast_if_present<ParmVarDecl>(VD);
@@ -1465,10 +1487,10 @@ namespace {
TemplateArgument ForgetPartiallySubstitutedPack() {
TemplateArgument Result;
- if (NamedDecl *PartialPack
- = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){
- MultiLevelTemplateArgumentList &TemplateArgs
- = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
+ if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope
+ ->getPartiallySubstitutedPack()) {
+ MultiLevelTemplateArgumentList &TemplateArgs =
+ const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
unsigned Depth, Index;
std::tie(Depth, Index) = getDepthAndIndex(PartialPack);
if (TemplateArgs.hasTemplateArgument(Depth, Index)) {
@@ -1488,10 +1510,10 @@ namespace {
if (Arg.isNull())
return;
- if (NamedDecl *PartialPack
- = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){
- MultiLevelTemplateArgumentList &TemplateArgs
- = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
+ if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope
+ ->getPartiallySubstitutedPack()) {
+ MultiLevelTemplateArgumentList &TemplateArgs =
+ const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
unsigned Depth, Index;
std::tie(Depth, Index) = getDepthAndIndex(PartialPack);
TemplateArgs.setArgument(Depth, Index, Arg);
@@ -1508,9 +1530,9 @@ namespace {
std::move(New);
return Old;
}
+
void RememberSubstitution(MultiLevelTemplateArgumentList Old) {
- const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) =
- std::move(Old);
+ const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) = Old;
}
TemplateArgument
@@ -1691,6 +1713,24 @@ namespace {
return inherited::TransformTemplateArgument(Input, Output, Uneval);
}
+ // Re-declared here so the TemplateArgumentLoc overload below does not
+ // hide the inherited Expr overload.
+ ExprResult RebuildPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc,
+ UnsignedOrNone NumExpansions) {
+ return inherited::RebuildPackExpansion(Pattern, EllipsisLoc,
+ NumExpansions);
+ }
+
+ TemplateArgumentLoc RebuildPackExpansion(TemplateArgumentLoc Pattern,
+ SourceLocation EllipsisLoc,
+ UnsignedOrNone NumExpansions) {
+ // We don't rewrite a PackExpansion type when we want to normalize a
+ // CXXFoldExpr constraint. We'll expand it when evaluating the constraint.
+ if (BuildPackExpansionTypes)
+ return inherited::RebuildPackExpansion(Pattern, EllipsisLoc,
+ NumExpansions);
+ return Pattern;
+ }
+
using TreeTransform::TransformTemplateSpecializationType;
QualType
TransformTemplateSpecializationType(TypeLocBuilder &TLB,
@@ -1961,7 +2001,8 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) {
if (ParmVarDecl *PVD = dyn_cast<ParmVarDecl>(D);
PVD && SemaRef.CurrentInstantiationScope &&
- SemaRef.inConstraintSubstitution() &&
+ (SemaRef.inConstraintSubstitution() ||
+ SemaRef.inParameterMappingSubstitution()) &&
maybeInstantiateFunctionParameterToScope(PVD))
return nullptr;
@@ -2759,18 +2800,29 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
concepts::NestedRequirement *
TemplateInstantiator::TransformNestedRequirement(
concepts::NestedRequirement *Req) {
- if (!Req->isDependent() && !AlwaysRebuild())
- return Req;
+
+ ASTContext &C = SemaRef.Context;
+
+ Expr *Constraint = Req->getConstraintExpr();
+ ConstraintSatisfaction Satisfaction;
+
+ auto NestedReqWithDiag = [&C, this](Expr *E,
+ ConstraintSatisfaction Satisfaction) {
+ Satisfaction.IsSatisfied = false;
+ SmallString<128> Entity;
+ llvm::raw_svector_ostream OS(Entity);
+ E->printPretty(OS, nullptr, SemaRef.getPrintingPolicy());
+ return new (C) concepts::NestedRequirement(
+ SemaRef.Context, C.backupStr(Entity), std::move(Satisfaction));
+ };
+
if (Req->hasInvalidConstraint()) {
if (AlwaysRebuild())
return RebuildNestedRequirement(Req->getInvalidConstraintEntity(),
Req->getConstraintSatisfaction());
return Req;
}
- Sema::InstantiatingTemplate ReqInst(SemaRef,
- Req->getConstraintExpr()->getBeginLoc(), Req,
- Sema::InstantiatingTemplate::ConstraintsCheck{},
- Req->getConstraintExpr()->getSourceRange());
+
if (!getEvaluateConstraints()) {
ExprResult TransConstraint = TransformExpr(Req->getConstraintExpr());
if (TransConstraint.isInvalid() || !TransConstraint.get())
@@ -2783,45 +2835,45 @@ TemplateInstantiator::TransformNestedRequirement(
SemaRef.Context, TransConstraint.get(), Satisfaction);
}
- ExprResult TransConstraint;
- ConstraintSatisfaction Satisfaction;
- TemplateDeductionInfo Info(Req->getConstraintExpr()->getBeginLoc());
+ bool Success;
+ Expr *NewConstraint;
+ TemplateDeductionInfo Info(Constraint->getBeginLoc());
{
EnterExpressionEvaluationContext ContextRAII(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- Sema::SFINAETrap Trap(SemaRef);
- Sema::InstantiatingTemplate ConstrInst(SemaRef,
- Req->getConstraintExpr()->getBeginLoc(), Req, Info,
- Req->getConstraintExpr()->getSourceRange());
+
+ Sema::InstantiatingTemplate ConstrInst(
+ SemaRef, Constraint->getBeginLoc(), Req,
+ Sema::InstantiatingTemplate::ConstraintsCheck(),
+ Constraint->getSourceRange());
+
if (ConstrInst.isInvalid())
return nullptr;
- llvm::SmallVector<Expr *> Result;
- if (!SemaRef.CheckConstraintSatisfaction(
- nullptr,
- AssociatedConstraint(Req->getConstraintExpr(),
- SemaRef.ArgPackSubstIndex),
- Result, TemplateArgs, Req->getConstraintExpr()->getSourceRange(),
- Satisfaction) &&
- !Result.empty())
- TransConstraint = Result[0];
- assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled "
- "by CheckConstraintSatisfaction.");
+
+ Sema::SFINAETrap Trap(SemaRef);
+
+ Success = !SemaRef.CheckConstraintSatisfaction(
+ Req, AssociatedConstraint(Constraint, SemaRef.ArgPackSubstIndex),
+ TemplateArgs, Constraint->getSourceRange(), Satisfaction,
+ /*TopLevelConceptId=*/nullptr, &NewConstraint);
+
+ assert((!Success || !Trap.hasErrorOccurred()) &&
+ "Substitution failures must be handled "
+ "by CheckConstraintSatisfaction.");
}
- ASTContext &C = SemaRef.Context;
- if (TransConstraint.isUsable() &&
- TransConstraint.get()->isInstantiationDependent())
- return new (C) concepts::NestedRequirement(TransConstraint.get());
- if (TransConstraint.isInvalid() || !TransConstraint.get() ||
- Satisfaction.HasSubstitutionFailure()) {
- SmallString<128> Entity;
- llvm::raw_svector_ostream OS(Entity);
- Req->getConstraintExpr()->printPretty(OS, nullptr,
- SemaRef.getPrintingPolicy());
- return new (C) concepts::NestedRequirement(
- SemaRef.Context, C.backupStr(Entity), Satisfaction);
+
+ if (!Success || Satisfaction.HasSubstitutionFailure())
+ return NestedReqWithDiag(Constraint, Satisfaction);
+
+ // FIXME: const correctness
+ // NewConstraint may be null when the MLTAL is dependent.
+ if (!NewConstraint) {
+ if (!Satisfaction.IsSatisfied)
+ return NestedReqWithDiag(Constraint, Satisfaction);
+
+ NewConstraint = Constraint;
}
- return new (C)
- concepts::NestedRequirement(C, TransConstraint.get(), Satisfaction);
+ return new (C) concepts::NestedRequirement(C, NewConstraint, Satisfaction);
}
TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T,
@@ -3078,7 +3130,7 @@ bool Sema::SubstTypeConstraint(
const ASTTemplateArgumentListInfo *TemplArgInfo =
TC->getTemplateArgsAsWritten();
- if (!EvaluateConstraints) {
+ if (!EvaluateConstraints && !inParameterMappingSubstitution()) {
UnsignedOrNone Index = TC->getArgPackSubstIndex();
if (!Index)
Index = SemaRef.ArgPackSubstIndex;
@@ -4378,6 +4430,16 @@ bool Sema::SubstTemplateArguments(
return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out);
}
+bool Sema::SubstTemplateArgumentsInParameterMapping(
+ ArrayRef<TemplateArgumentLoc> Args, SourceLocation BaseLoc,
+ const MultiLevelTemplateArgumentList &TemplateArgs,
+ TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes) {
+ TemplateInstantiator Instantiator(
+ TemplateInstantiator::ForParameterMappingSubstitution, *this, BaseLoc,
+ TemplateArgs, BuildPackExpansionTypes);
+ return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out);
+}
+
ExprResult
Sema::SubstExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) {
if (!E)
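
[Editor's note] Several hunks above thread an "in parameter mapping substitution" state through instantiation. A parameter mapping, in [temp.constr.atomic] terms, is the substitution attached to each atomic constraint during normalization; a hedged illustration of what gets mapped:

template <class T>
concept Atomic = __is_trivially_copyable(T);   // one atomic constraint

template <class T>
concept ViaPointer = Atomic<T *>;              // normal form: Atomic's
                                               // constraint with parameter
                                               // mapping T -> T*

// Checking ViaPointer<int> substitutes through the mapping and evaluates
// __is_trivially_copyable(int *).
static_assert(ViaPointer<int>);
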
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 6967301..51b55b8 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -3722,10 +3722,6 @@ public:
ParentContext);
}
- /// Build a new Objective-C boxed expression.
- ///
- /// By default, performs semantic analysis to build the new expression.
- /// Subclasses may override this routine to provide different behavior.
ExprResult RebuildConceptSpecializationExpr(NestedNameSpecifierLoc NNS,
SourceLocation TemplateKWLoc, DeclarationNameInfo ConceptNameInfo,
NamedDecl *FoundDecl, ConceptDecl *NamedConcept,
@@ -5110,9 +5106,13 @@ bool TreeTransform<Derived>::TransformTemplateArguments(
typedef TemplateArgumentLocInventIterator<Derived,
TemplateArgument::pack_iterator>
PackLocIterator;
+
+ TemplateArgumentListInfo *PackOutput = &Outputs;
+ TemplateArgumentListInfo New;
+
if (TransformTemplateArguments(
PackLocIterator(*this, In.getArgument().pack_begin()),
- PackLocIterator(*this, In.getArgument().pack_end()), Outputs,
+ PackLocIterator(*this, In.getArgument().pack_end()), *PackOutput,
Uneval))
return true;
@@ -5179,7 +5179,6 @@ bool TreeTransform<Derived>::TransformTemplateArguments(
}
return false;
-
}
// FIXME: Find ways to reduce code duplication for pack expansions.
@@ -6247,7 +6246,7 @@ ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
/* DefArg */ nullptr);
newParm->setScopeInfo(OldParm->getFunctionScopeDepth(),
OldParm->getFunctionScopeIndex() + indexAdjustment);
- transformedLocalDecl(OldParm, {newParm});
+ getDerived().transformedLocalDecl(OldParm, {newParm});
return newParm;
}
@@ -7082,11 +7081,11 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType(
TypeLocBuilder &TLB,
UnaryTransformTypeLoc TL) {
QualType Result = TL.getType();
+ TypeSourceInfo *NewBaseTSI = TL.getUnderlyingTInfo();
if (Result->isDependentType()) {
const UnaryTransformType *T = TL.getTypePtr();
- TypeSourceInfo *NewBaseTSI =
- getDerived().TransformType(TL.getUnderlyingTInfo());
+ NewBaseTSI = getDerived().TransformType(TL.getUnderlyingTInfo());
if (!NewBaseTSI)
return QualType();
QualType NewBase = NewBaseTSI->getType();
@@ -7101,7 +7100,7 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType(
UnaryTransformTypeLoc NewTL = TLB.push<UnaryTransformTypeLoc>(Result);
NewTL.setKWLoc(TL.getKWLoc());
NewTL.setParensRange(TL.getParensRange());
- NewTL.setUnderlyingTInfo(TL.getUnderlyingTInfo());
+ NewTL.setUnderlyingTInfo(NewBaseTSI);
return Result;
}
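
[Editor's note] The TransformUnaryTransformType change above propagates the transformed base TypeSourceInfo instead of reusing the original, and the matching SemaTemplateDeduction hunk falls back to the base type while the underlying type is still null (dependent). For reference, the construct involved:

enum class Small : short { a };

template <class T>
using Under = __underlying_type(T);   // UnaryTransformType: the underlying
                                      // type stays null while T is dependent

static_assert(__is_same(Under<Small>, short));
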
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index cf32d4f..5456e73 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2424,7 +2424,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl(
VisitDecl(D);
llvm::SmallVector<TemplateArgument, 4> Args;
for (unsigned I = 0; I < D->NumTemplateArgs; ++I)
- Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/true));
+ Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/false));
D->setTemplateArguments(Args);
}
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 70b898a..eef97a8 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -807,15 +807,19 @@ readConstraintSatisfaction(ASTRecordReader &Record) {
if (!Satisfaction.IsSatisfied) {
unsigned NumDetailRecords = Record.readInt();
for (unsigned i = 0; i != NumDetailRecords; ++i) {
- if (/* IsDiagnostic */Record.readInt()) {
+ auto Kind = Record.readInt();
+ if (Kind == 0) {
SourceLocation DiagLocation = Record.readSourceLocation();
StringRef DiagMessage = C.backupStr(Record.readString());
- Satisfaction.Details.emplace_back(
- new (C) ConstraintSatisfaction::SubstitutionDiagnostic(
- DiagLocation, DiagMessage));
- } else
+ Satisfaction.Details.emplace_back(new (
+ C) ConstraintSubstitutionDiagnostic(DiagLocation, DiagMessage));
+ } else if (Kind == 1) {
Satisfaction.Details.emplace_back(Record.readExpr());
+ } else {
+ assert(Kind == 2);
+ Satisfaction.Details.emplace_back(Record.readConceptReference());
+ }
}
}
return Satisfaction;
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index ebda91e..acf3453 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -482,14 +482,20 @@ addConstraintSatisfaction(ASTRecordWriter &Record,
if (!Satisfaction.IsSatisfied) {
Record.push_back(Satisfaction.NumRecords);
for (const auto &DetailRecord : Satisfaction) {
- auto *E = dyn_cast<Expr *>(DetailRecord);
- Record.push_back(/* IsDiagnostic */ E == nullptr);
- if (E)
- Record.AddStmt(E);
- else {
- auto *Diag = cast<std::pair<SourceLocation, StringRef> *>(DetailRecord);
+ if (auto *Diag = dyn_cast<const ConstraintSubstitutionDiagnostic *>(
+ DetailRecord)) {
+ Record.push_back(/*Kind=*/0);
Record.AddSourceLocation(Diag->first);
Record.AddString(Diag->second);
+ continue;
+ }
+ if (auto *E = dyn_cast<const Expr *>(DetailRecord)) {
+ Record.push_back(/*Kind=*/1);
+ Record.AddStmt(const_cast<Expr *>(E));
+ } else {
+ Record.push_back(/*Kind=*/2);
+ auto *CR = cast<const ConceptReference *>(DetailRecord);
+ Record.AddConceptReference(CR);
}
}
}
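
[Editor's note] The reader/writer hunks above replace the old boolean IsDiagnostic tag with a three-valued kind so a ConceptReference detail can round-trip. A sketch of the encoding with stand-in types (all names hypothetical, not Clang's):

#include <cstdint>
#include <string>
#include <variant>

// Stand-ins for the real detail record payloads.
struct SubstDiag  { uint32_t Loc; std::string Message; };  // serialized as Kind 0
struct ExprNode   { uintptr_t Opaque; };                   // serialized as Kind 1
struct ConceptRef { uintptr_t Opaque; };                   // serialized as Kind 2

using Detail = std::variant<SubstDiag, ExprNode, ConceptRef>;

// The record stores the kind tag first and the payload after it; the reader
// switches on the tag, as readConstraintSatisfaction does above.
inline unsigned kindOf(const Detail &D) {
  return static_cast<unsigned>(D.index());  // 0, 1, or 2
}
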
diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp
index 84d981d..9419dba 100644
--- a/clang/test/AST/ast-dump-concepts.cpp
+++ b/clang/test/AST/ast-dump-concepts.cpp
@@ -20,8 +20,9 @@ struct Foo {
// CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept'
// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} <col:13, col:31> 'bool' Concept {{.*}} 'binary_concept'
// CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} <line:13:9> col:9
- // CHECK-NEXT: | |-TemplateArgument type 'type-parameter-1-0'
- // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0
+ // CHECK-NEXT: | |-TemplateArgument type 'R'
+ // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0
+ // CHECK-NEXT: | | `-TemplateTypeParm {{.*}} 'R'
// CHECK-NEXT: | `-TemplateArgument type 'int'
// CHECK-NEXT: | `-BuiltinType {{.*}} 'int'
// CHECK-NEXT: |-TemplateArgument {{.*}} type 'R'
@@ -35,8 +36,9 @@ struct Foo {
// CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept'
// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} <col:13> 'bool'
// CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} <line:10:9> col:9
- // CHECK-NEXT: | `-TemplateArgument type 'type-parameter-1-0'
- // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0
+ // CHECK-NEXT: | `-TemplateArgument type 'R'
+ // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0
+ // CHECK-NEXT: | `-TemplateTypeParm {{.*}} 'R'
template <unary_concept R>
Foo(R);
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index 781fb9f..9a3adbc 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -185,17 +185,18 @@ void foo() {
// CHECK-NEXT: | |-BinaryOperator {{.*}} 'bool' '&&'
// CHECK-NEXT: | | |-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'invocable'
// CHECK-NEXT: | | | |-ImplicitConceptSpecializationDecl {{.*}}
-// CHECK-NEXT: | | | | |-TemplateArgument type 'type-parameter-0-2'
-// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
-// CHECK-NEXT: | | | | `-TemplateArgument pack '<GH124715::Packs<type-parameter-0-1...>>'
-// CHECK-NEXT: | | | | `-TemplateArgument type 'GH124715::Packs<type-parameter-0-1...>'
-// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'GH124715::Packs<type-parameter-0-1...>' dependent
-// CHECK-NEXT: | | | | |-name: 'GH124715::Packs'
+// CHECK-NEXT: | | | | |-TemplateArgument type 'U'
+// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
+// CHECK-NEXT: | | | | | `-TemplateTypeParm {{.*}} 'U'
+// CHECK-NEXT: | | | | `-TemplateArgument pack '<Packs<Ts...>>'
+// CHECK-NEXT: | | | | `-TemplateArgument type 'Packs<Ts...>'
+// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'Packs<Ts...>' dependent
+// CHECK-NEXT: | | | | |-name: 'Packs':'GH124715::Packs' qualified
// CHECK-NEXT: | | | | | `-ClassTemplateDecl {{.*}} Packs
-// CHECK-NEXT: | | | | `-TemplateArgument pack '<type-parameter-0-1...>'
-// CHECK-NEXT: | | | | `-TemplateArgument type 'type-parameter-0-1...'
-// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'type-parameter-0-1...' dependent
-// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack
+// CHECK-NEXT: | | | | `-TemplateArgument type 'Ts...'
+// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'Ts...' dependent
+// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack
+// CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'Ts'
// CHECK-NEXT: | | | |-TemplateArgument {{.*}} type 'U':'type-parameter-0-2'
// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
// CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'U'
diff --git a/clang/test/CXX/drs/cwg25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp
index 5c2948f..0e0fc73 100644
--- a/clang/test/CXX/drs/cwg25xx.cpp
+++ b/clang/test/CXX/drs/cwg25xx.cpp
@@ -243,19 +243,20 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07
// since-cxx20-note@#cwg2565-VC {{because 'b' would be invalid: argument may not have 'void' type}}
template<typename T>
- concept ErrorRequires = requires (ErrorRequires auto x) {
+ concept ErrorRequires = requires (ErrorRequires auto x) { // #cwg2565-expr
// since-cxx20-error@-1 {{a concept definition cannot refer to itself}}
// since-cxx20-note@-2 {{declared here}}
// since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}}
x;
};
static_assert(ErrorRequires<int>);
- // since-cxx20-error@-1 {{static assertion failed}}
- // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // since-cxx20-error@-1 {{static assertion failed}} \
+ // since-cxx20-note@-1 {{because 'int' does not satisfy 'ErrorRequires'}} \
+ // since-cxx20-note@#cwg2565-expr {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
template<typename T>
concept NestedErrorInRequires = requires (T x) { // #cwg2565-NEIR
- requires requires (NestedErrorInRequires auto y) {
+ requires requires (NestedErrorInRequires auto y) { // #cwg2565-NEIR-inner
// since-cxx20-error@-1 {{a concept definition cannot refer to itself}}
// since-cxx20-note@#cwg2565-NEIR {{declared here}}
// since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}}
@@ -263,8 +264,9 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07
};
};
static_assert(NestedErrorInRequires<int>);
- // since-cxx20-error@-1 {{static assertion failed}}
- // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // since-cxx20-error@-1 {{static assertion failed}} \
+ // since-cxx20-note@-1 {{because 'int' does not satisfy 'NestedErrorInRequires'}} \
+ // since-cxx20-note-re@#cwg2565-NEIR-inner {{because {{.*}} would be invalid: constraint depends on a previously diagnosed expression}}
#endif
} // namespace cwg2565
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp
index 28b5d0a..af2fc93 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp
@@ -140,7 +140,8 @@ concept C7 = sizeof(T) == 1 || sizeof(
::type) == 1;
static_assert(!C6<short>);
-static_assert(!C6<char>); // expected-note{{while checking the satisfaction of concept 'C6<char>' requested here}}
+static_assert(!C6<char>);
+// expected-note@-1 {{while checking the satisfaction of concept 'C6<char>' requested here}}
static_assert(C7<char>);
static_assert(!C7<short>); // expected-note{{while checking the satisfaction of concept 'C7<short>' requested here}}
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp
index 31587a9..af2dce8 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp
@@ -35,14 +35,14 @@ using r2i2 = r2<A>; // expected-error{{constraints not satisfied for class templ
using r2i3 = r2<D>;
using r2i4 = r2<const D>; // expected-error{{constraints not satisfied for class template 'r2' [with T = const D]}}
-template<typename T> requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}}
+template<typename T> requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}}
struct r3 {};
using r3i1 = r3<int>;
using r3i2 = r3<A>;
using r3i3 = r3<A &>;
using r3i4 = r3<void>; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}}
-using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}}
+using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}}
// Non-dependent expressions
@@ -89,7 +89,7 @@ template<typename T>
concept Large = sizeof(typename remove_reference<T>::type) >= 4;
// expected-note@-1{{because 'sizeof(typename remove_reference<short &>::type) >= 4' (2 >= 4) evaluated to false}}
-template<typename T> requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large':}}
+template<typename T> requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large'}}
struct r7 {};
using r7i1 = r7<int>;
@@ -149,7 +149,7 @@ namespace std_example {
template<typename T> constexpr bool is_same_v<T, T> = true;
template<typename T, typename U> concept same_as = is_same_v<T, U>;
- // expected-note@-1 {{because 'is_same_v<int, int *>' evaluated to false}}
+ // expected-note@-1 {{because 'is_same_v<int, typename std_example::T2::inner>' evaluated to false}}
static_assert(C1<int>);
static_assert(C1<int*>);
@@ -160,7 +160,7 @@ namespace std_example {
template<typename T> concept C2 =
requires(T x) {
{*x} -> same_as<typename T::inner>;
- // expected-note@-1{{because type constraint 'same_as<int, typename std_example::T2::inner>' was not satisfied:}}
+ // expected-note@-1{{because 'same_as<int, typename std_example::T2::inner>' evaluated to false}}
// expected-note@-2{{because '*x' would be invalid: indirection requires pointer operand ('int' invalid)}}
};
@@ -173,9 +173,9 @@ namespace std_example {
int operator *() { return 0; }
};
static_assert(C2<T1>);
- template<C2 T> struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'std_example::T2' does not satisfy 'C2'}}
+ template<C2 T> struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'T2' does not satisfy 'C2'}}
using c2c1 = C2_check<int>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = int]}}
- using c2c2 = C2_check<T2>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::T2]}}
+ using c2c2 = C2_check<T2>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = T2]}}
template<typename T>
void g(T t) noexcept(sizeof(T) == 1) {}
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
index 033ae34..70a96be 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
@@ -43,11 +43,10 @@ namespace std_example {
requires sizeof(a) == 4; // OK
requires a == 0; // expected-error{{substitution into constraint expression resulted in a non-constant expression}}
// expected-note@-1{{while checking the satisfaction of nested requirement requested here}}
- // expected-note@-2{{in instantiation of requirement here}}
- // expected-note@-3{{while checking the satisfaction of nested requirement requested here}}
- // expected-note@-6{{while substituting template arguments into constraint expression here}}
- // expected-note@-5{{function parameter 'a' with unknown value cannot be used in a constant expression}}
- // expected-note@-8{{declared here}}
+ // expected-note@-2{{while checking the satisfaction of nested requirement requested here}}
+ // expected-note@-5{{while substituting template arguments into constraint expression here}}
+ // expected-note@-4{{function parameter 'a' with unknown value cannot be used in a constant expression}}
+ // expected-note@-7{{declared here}}
};
static_assert(C2<int>); // expected-error{{static assertion failed}}
// expected-note@-1{{while checking the satisfaction of concept 'C2<int>' requested here}}
@@ -84,31 +83,26 @@ static_assert(Pipes<S>);
static_assert(Pipes<double>);
static_assert(Amps1<S>);
-static_assert(!Amps1<double>);
+static_assert(Amps1<double>);
static_assert(Amps2<S>);
-static_assert(!Amps2<double>);
+static_assert(Amps2<double>);
template<class T>
-void foo1() requires requires (T x) { // #foo1
+void foo1() requires requires (T x) {
requires
- True<decltype(x.value)> // #foo1Value
+ True<decltype(x.value)>
&& True<T>;
} {}
template<class T> void fooPipes() requires Pipes<T> {}
-template<class T> void fooAmps1() requires Amps1<T> {} // #fooAmps1
+template<class T> void fooAmps1() requires Amps1<T> {}
void foo() {
foo1<S>();
- foo1<int>(); // expected-error {{no matching function for call to 'foo1'}}
- // expected-note@#foo1Value {{because 'True<decltype(x.value)> && True<T>' would be invalid: member reference base type 'int' is not a structure or union}}
- // expected-note@#foo1 {{candidate template ignored: constraints not satisfied [with T = int]}}
+ foo1<int>();
fooPipes<S>();
fooPipes<int>();
fooAmps1<S>();
- fooAmps1<int>(); // expected-error {{no matching function for call to 'fooAmps1'}}
- // expected-note@#fooAmps1 {{candidate template ignored: constraints not satisfied [with T = int]}}
- // expected-note@#fooAmps1 {{because 'int' does not satisfy 'Amps1'}}
- // expected-note@#Amps1 {{because 'True<decltype(x.value)> && True<T> && !False<T>' would be invalid: member reference base type 'int' is not a structure or union}}
+ fooAmps1<int>();
}
template<class T>
@@ -158,15 +152,16 @@ void func() {
// expected-note@#bar {{while substituting template arguments into constraint expression here}}
// expected-note@#bar {{while checking the satisfaction of nested requirement requested here}}
// expected-note@#bar {{candidate template ignored: constraints not satisfied [with T = False]}}
- // expected-note@#bar {{because 'X<SubstitutionFailureNestedRequires::ErrorExpressions_NotSF::False>::value' evaluated to false}}
+ // expected-note@#bar {{because 'X<False>::value' evaluated to false}}
bar<int>();
+ // expected-error@-1 {{no matching function for call to 'bar'}} \
// expected-note@-1 {{while checking constraint satisfaction for template 'bar<int>' required here}} \
- // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}}
+ // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}} \
// expected-note@#bar {{in instantiation of static data member}}
- // expected-note@#bar {{in instantiation of requirement here}}
// expected-note@#bar {{while checking the satisfaction of nested requirement requested here}}
// expected-note@#bar {{while substituting template arguments into constraint expression here}}
+ // expected-note@#bar {{candidate template ignored}}
// expected-error@#X_Value {{type 'int' cannot be used prior to '::' because it has no members}}
}
}
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp
index 5199708..5dcb188 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp
@@ -39,14 +39,14 @@ using r2i4 = r2<const D>; // expected-error{{constraints not satisfied for class
template<typename T> requires requires { sizeof(T); }
// expected-note@-1{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}}
-// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}}
+// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}}
struct r3 {};
using r3i1 = r3<int>;
using r3i2 = r3<A>;
using r3i3 = r3<A &>;
using r3i4 = r3<void>; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}}
-using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}}
+using r3i4 = r3<class nonexistent>; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}}
template<typename T> requires requires (T t) { 0; "a"; (void)'a'; }
struct r4 {};
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp
index 5433cfb..28dff33 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp
@@ -182,14 +182,14 @@ namespace std_example {
static_assert(C1<has_inner_and_type> && C2<has_inner_and_type> && C3<has_inner_and_type>);
template<C1 T> struct C1_check {};
// expected-note@-1 {{because 'int' does not satisfy 'C1'}}
- // expected-note@-2 {{because 'std_example::has_type' does not satisfy 'C1'}}
+ // expected-note@-2 {{because 'has_type' does not satisfy 'C1'}}
template<C2 T> struct C2_check {};
- // expected-note@-1 {{because 'std_example::has_inner' does not satisfy 'C2'}}
+ // expected-note@-1 {{because 'has_inner' does not satisfy 'C2'}}
template<C3 T> struct C3_check {};
// expected-note@-1 {{because 'void' does not satisfy 'C3'}}
using c1 = C1_check<int>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = int]}}
- using c2 = C1_check<has_type>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = std_example::has_type]}}
- using c3 = C2_check<has_inner>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::has_inner]}}
+ using c2 = C1_check<has_type>; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = has_type]}}
+ using c3 = C2_check<has_inner>; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = has_inner]}}
using c4 = C3_check<void>; // expected-error{{constraints not satisfied for class template 'C3_check' [with T = void]}}
}
@@ -199,10 +199,10 @@ template <typename T> concept C = requires { requires requires { T::a; }; };
// expected-note@-1 {{because 'T::a' would be invalid: no member named 'a' in 'PR48656::T1'}}
template <C...> struct A {};
-// expected-note@-1 {{because 'PR48656::T1' does not satisfy 'C'}}
+// expected-note@-1 {{because 'T1' does not satisfy 'C'}}
struct T1 {};
-template struct A<T1>; // expected-error {{constraints not satisfied for class template 'A' [with $0 = <PR48656::T1>]}}
+template struct A<T1>; // expected-error {{constraints not satisfied for class template 'A' [with $0 = <T1>]}}
struct T2 { static constexpr bool a = false; };
template struct A<T2>;
diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
index 59e6a48..6dea0c6 100644
--- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
+++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
@@ -28,9 +28,8 @@ template<typename T> requires requires {
requires S<T>{};
// expected-error@-1{{atomic constraint must be of type 'bool' (found 'S<int>')}}
// expected-note@-2{{while checking the satisfaction}}
- // expected-note@-3{{in instantiation of requirement}}
- // expected-note@-4{{while checking the satisfaction}}
- // expected-note@-6{{while substituting template arguments}}
+ // expected-note@-3{{while checking the satisfaction of nested requirement}}
+ // expected-note@-5{{while substituting template arguments}}
// expected-note@#F3INST{{while checking constraint satisfaction}}
// expected-note@#F3INST{{while substituting deduced template arguments into function template 'f3' [with T = int]}}
//
diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp
index 3992835..34c5c5d 100644
--- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp
+++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp
@@ -1,21 +1,31 @@
// RUN: %clang_cc1 -std=c++2a -x c++ -verify %s
+// RUN: %clang_cc1 -std=c++2c -x c++ -verify %s
template<typename T> concept True = true;
-template<typename T> concept Foo = True<T*>;
-template<typename T> concept Bar = Foo<T&>;
-template<typename T> requires Bar<T> struct S { };
-template<typename T> requires Bar<T> && true struct S<T> { };
+template<typename T> concept Foo = True<T*>; // #Foo
+template<typename T> concept Bar = Foo<T&>; // #Bar
+template<typename T> requires Bar<T> struct S { }; // #S
+template<typename T> requires Bar<T> && true struct S<T> { }; // #SpecS
+// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}}
+// expected-error@#Foo 2{{'type name' declared as a pointer to a reference of type 'T &'}}
+// expected-note@#SpecS {{while substituting into concept arguments here}}
+// expected-note@#S {{while substituting into concept arguments here}}
+// expected-note@#Bar 2{{while substituting into concept arguments here}}
+// expected-note@#S {{template is declared here}}
+
+
template<typename T> concept True2 = sizeof(T) >= 0;
-template<typename T> concept Foo2 = True2<T*>;
-// expected-error@-1{{'type name' declared as a pointer to a reference of type 'type-parameter-0-0 &'}}
-template<typename T> concept Bar2 = Foo2<T&>;
-// expected-note@-1{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}}
-template<typename T> requires Bar2<T> struct S2 { };
+template<typename T> concept Foo2 = True2<T*>; // #Foo2
+
+template<typename T> concept Bar2 = Foo2<T&>; // #Bar2
+// expected-note@-1 3{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}}
+template<typename T> requires Bar2<T> struct S2 { }; // #SpecS2_1
// expected-note@-1{{template is declared here}}
-template<typename T> requires Bar2<T> && true struct S2<T> { };
+template<typename T> requires Bar2<T> && true struct S2<T> { }; // #SpecS2_2
// expected-error@-1{{class template partial specialization is not more specialized than the primary template}}
-// expected-note@-2{{while calculating associated constraint of template 'S2<T>' here}}
+// expected-error@#Foo2{{'type name' declared as a pointer to a reference of type 'T &'}}
+
namespace type_pack {
template<typename... Args>
@@ -71,16 +81,31 @@ namespace non_type_pack {
namespace PR47174 {
// This checks that we don't crash with a failed substitution on the first constrained argument when
// performing normalization.
-template <Bar2 T, True U>
+template <Bar2 T, True U> // #S3_Header
requires true struct S3; // expected-note {{template is declared here}}
template <True T, True U>
-requires true struct S3<T, U>; // expected-error {{class template partial specialization is not more specialized than the primary template}}
+requires true struct S3<T, U>;
+// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}}
+// expected-error@#Foo2 2{{'type name' declared as a pointer to a reference of type 'T &'}}
+// expected-note@#SpecS2_1 {{while substituting into concept arguments here}}
+// expected-note@#SpecS2_2 {{while substituting into concept arguments here}}
+// expected-note@#S3_Header {{while substituting into concept arguments here}}
+// expected-note@#Bar2 {{while substituting into concept arguments here}}
+
// Same as above, for the second position (but this was already working).
-template <True T, Bar2 U>
-requires true struct S4; // expected-note {{template is declared here}}
+template <True T, Bar2 U> // #S4_Header
+requires true struct S4; // #S4
template <True T, True U>
-requires true struct S4<T, U>; // expected-error {{class template partial specialization is not more specialized than the primary template}}
+requires true struct S4<T, U>; // #S4-spec
+// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}}
+// expected-error@#Foo2 {{'type name' declared as a pointer to a reference of type 'U &'}}
+// expected-note@#S4_Header {{while substituting into concept arguments here}}
+// expected-note@#S4 {{template is declared here}}
+// expected-note@#S4 {{similar constraint expressions not considered equivalent}}
+// expected-note@#S4-spec {{similar constraint expression here}}
+
+
struct X {
template<int> struct Y {
@@ -96,7 +121,7 @@ template<class T> requires C1<T> && C2<T> void t1() = delete; // expected-note {
template void t1<X>();
void t1() { t1<X>(); } // expected-error {{call to deleted function 't1'}}
-template<class T> requires C1<T> void t2() {}; // expected-note 2 {{candidate function}}
+template<class T> requires C1<T> void t2() {}; // expected-note 2 {{candidate function}}
template<class T> requires C2<T> void t2() {}; // expected-note 2 {{candidate function}}
template void t2<X>(); // expected-error {{partial ordering for explicit instantiation of 't2' is ambiguous}}
void t2() { t2<X>(); } // expected-error {{call to 't2' is ambiguous}}
diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp
index 4f5fdd3..c0406f8 100644
--- a/clang/test/CXX/temp/temp.param/p10-2a.cpp
+++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp
@@ -86,16 +86,18 @@ using f1 = F<int>;
using f2 = F<long>; // expected-error {{constraints not satisfied for alias template 'F' [with T = long]}}
template<typename T, typename... Ts>
-concept OneOf = (is_same_v<T, Ts> || ...);
-// expected-note@-1 2{{because 'is_same_v<char, char[1]>' evaluated to false}}
-// expected-note@-2 2{{and 'is_same_v<char, char[2]>' evaluated to false}}
-// expected-note@-3 {{because 'is_same_v<short, int>' evaluated to false}}
-// expected-note@-4 {{and 'is_same_v<short, long>' evaluated to false}}
-// expected-note@-5 {{and 'is_same_v<short, char>' evaluated to false}}
-// expected-note@-6 3{{because 'is_same_v<int, char[1]>' evaluated to false}}
-// expected-note@-7 3{{and 'is_same_v<int, char[2]>' evaluated to false}}
-// expected-note@-8 2{{because 'is_same_v<std::nullptr_t, char>' evaluated to false}}
-// expected-note@-9 2{{and 'is_same_v<std::nullptr_t, int>' evaluated to false}}
+concept OneOf = (is_same_v<T, Ts> || ...); // #OneOf
+// expected-note@#OneOf 2{{because 'is_same_v<char, char[1]>' evaluated to false}}
+// expected-note@#OneOf 2{{and 'is_same_v<char, char[2]>' evaluated to false}}
+// expected-note@#OneOf {{because 'is_same_v<short, int>' evaluated to false}}
+// expected-note@#OneOf {{and 'is_same_v<short, long>' evaluated to false}}
+// expected-note@#OneOf {{and 'is_same_v<short, char>' evaluated to false}}
+// expected-note@#OneOf 3{{because 'is_same_v<int, char[1]>' evaluated to false}}
+// expected-note@#OneOf 3{{and 'is_same_v<int, char[2]>' evaluated to false}}
+// expected-note@#OneOf {{because 'is_same_v<decltype(nullptr), char>' evaluated to false}}
+// expected-note@#OneOf {{because 'is_same_v<std::nullptr_t, char>' evaluated to false}}
+// expected-note@#OneOf {{and 'is_same_v<std::nullptr_t, int>' evaluated to false}}
+// expected-note@#OneOf {{and 'is_same_v<decltype(nullptr), int>' evaluated to false}}
template<OneOf<char[1], char[2]> T, OneOf<int, long, char> U>
// expected-note@-1 2{{because 'OneOf<char, char[1], char[2]>' evaluated to false}}
@@ -124,6 +126,7 @@ using I = int;
using i1 = I<1>;
using i2 = I<'a'>;
+// FIXME: This crashes with -std=c++2c
using i3 = I<nullptr>;
// expected-error@-1 {{constraints not satisfied for alias template 'I' [with x = nullptr]}}
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index fd1a5c0..404b928 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -587,6 +587,23 @@ static_assert(__is_same(decltype(a), A<A<int>>));
} // namespace GH133132
+namespace GH131408 {
+
+struct Node {};
+
+template <class T, Node>
+struct A {
+ A(T) {}
+};
+
+template <class T>
+using AA = A<T, {}>;
+
+AA a{0};
+
+static_assert(__is_same(decltype(a), A<int, Node{}>));
+}
+
namespace GH130604 {
template <typename T> struct A {
A(T);
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index 99a82d9..ce86266 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -127,13 +127,12 @@ struct F {
template <typename T>
constexpr int f5() requires C<T> { return 1; } // expected-note {{while checking the satisfaction}}
- // expected-note@-1 {{while substituting template arguments}}
- // expected-note@-2 {{candidate template ignored}}
+ // expected-note@-1 {{candidate template ignored}}
template <typename T>
-constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}}
- // expected-note@-1 4 {{while substituting template arguments}}
- // expected-note@-2 {{candidate template ignored}}
+constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}} \
+ // expected-note 4 {{while substituting template arguments}} \
+ // expected-note {{candidate template ignored}}
static_assert(f5<int>() == 1);
static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}}
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 74b3573..6777dc2 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -1257,13 +1257,13 @@ void f() {
(&A::e)(a, a);
// expected-error@-1 {{no matching function for call to 'e'}} \
// expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \
- // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}}
+ // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}}
(&A::e<A>)(a, 0);
(&A::e<A>)(a, a);
// expected-error@-1 {{no matching function for call to 'e'}} \
// expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \
- // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}}
+ // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}}
(&A::e<A, int>)(a, 0);
@@ -1273,12 +1273,12 @@ void f() {
(&A::f<A>)(a);
// expected-error@-1 {{no matching function for call to 'f'}} \
// expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \
- // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}}
+ // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}}
(&A::f)(a);
// expected-error@-1 {{no matching function for call to 'f'}} \
// expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \
- // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}}
+ // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}}
(&A::g)(a);
(&A::g)(a, 0);
diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
index 4220486..137f46e 100644
--- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
+++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -std=c++2c -verify %s
-template <class T> concept A = true;
-template <class T> concept C = A<T> && true;
+template <class T> concept A = (T(), true);
+template <class T> concept C = A<T> && true; // #C
template <class T> concept D = A<T> && __is_same(T, int);
@@ -40,13 +40,23 @@ constexpr int i(T...) { return 1; }; // expected-note {{candidate}}
static_assert(i(0) == 1); // expected-error {{call to 'i' is ambiguous}}
-template <class... T> requires (A<T> || ... || true)
-constexpr int j(T...) { return 0; };
-template <class... T> requires (C<T> && ... && true)
-constexpr int j(T...) { return 1; };
+template <class... T> requires (A<T> || ... || true) constexpr int j(T...) { return 0; }; // #j1
+template <class... T> requires (C<T> && ... && true) constexpr int j(T...) { return 1; }; // #j2
static_assert(j(0) == 1);
+// expected-error@-1 {{call to 'j' is ambiguous}}
+// expected-note@#j1 {{candidate function [with T = <int>]}}
+// expected-note@#j2 {{candidate function [with T = <int>]}}
+// expected-note@#j2 {{similar constraint expressions not considered equivalent}}
+// expected-note@#j1 {{similar constraint expression here}}
+
+
static_assert(j() == 1);
+// expected-error@-1 {{call to 'j' is ambiguous}}
+// expected-note@#j1 {{candidate function [with T = <>]}}
+// expected-note@#j2 {{candidate function [with T = <>]}}
+// expected-note@#j2 {{similar constraint expressions not considered equivalent}}
+// expected-note@#j1 {{similar constraint expression here}}
@@ -107,7 +117,7 @@ void test() {
}
namespace substitution {
- struct S {
+struct S {
using type = int;
};
@@ -144,51 +154,69 @@ consteval int Or3() requires (C<typename T::type> || ... || C<typename U::type>)
static_assert(And1<>() == 1);
static_assert(And1<S>() == 1);
static_assert(And1<S, S>() == 1);
+// FIXME: The diagnostics are not so great
static_assert(And1<int>() == 1); // expected-error {{no matching function for call to 'And1'}}
- // expected-note@#and1 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and1 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int>]}}
+ // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And1<S, int>() == 1); // expected-error {{no matching function for call to 'And1'}}
- // expected-note@#and1 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and1 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <S, int>]}}
+ // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And1<int, S>() == 1); // expected-error {{no matching function for call to 'And1'}}
- // expected-note@#and1 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and1 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int, S>]}}
+ // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And2<S>() == 2);
static_assert(And2<S, S>() == 2);
-static_assert(And2<int>() == 2);
+static_assert(And2<int>() == 2); // expected-error {{no matching function for call to 'And2'}}
+ // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
+ // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
+
static_assert(And2<int, int>() == 2); // expected-error {{no matching function for call to 'And2'}}
- // expected-note@#and2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and2 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}} \
+ // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And2<S, int>() == 2); // expected-error {{no matching function for call to 'And2'}}
- // expected-note@#and2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and2 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}}
+ // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And2<int, S>() == 2); // expected-error {{no matching function for call to 'And2'}}
- // expected-note@#and2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and2 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}}
+ // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(And3<S>() == 3);
static_assert(And3<S, S>() == 3);
static_assert(And3<int>() == 3); // expected-error {{no matching function for call to 'And3'}}
- // expected-note@#and3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and3 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
+ // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
+
static_assert(And3<int, int>() == 3); // expected-error {{no matching function for call to 'And3'}}
- // expected-note@#and3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and3 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}}
+ // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
+
static_assert(And3<S, int>() == 3); // expected-error {{no matching function for call to 'And3'}}
- // expected-note@#and3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and3 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}}
+ // expected-note@#and3 {{because 'typename U::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
+
static_assert(And3<int, S>() == 3); // expected-error {{no matching function for call to 'And3'}}
- // expected-note@#and3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#and3 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}}
+ // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}}
@@ -198,25 +226,26 @@ static_assert(Or1<int, S>() == 1);
static_assert(Or1<S, int>() == 1);
static_assert(Or1<S, S>() == 1);
static_assert(Or1<int>() == 1); // expected-error {{no matching function for call to 'Or1'}}
- // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} \
- // expected-note@#or1 {{because substituted constraint expression is ill-formed}}
-
+ // expected-note@#or1 {{candidate template ignored: constraints not satisfied}}
+ // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(Or2<S>() == 2);
static_assert(Or2<int, S>() == 2);
static_assert(Or2<S, int>() == 2);
static_assert(Or2<S, S>() == 2);
static_assert(Or2<int>() == 2); // expected-error {{no matching function for call to 'Or2'}}
- // expected-note@#or2 {{candidate template ignored: constraints not satisfied}} \
- // expected-note@#or2 {{because substituted constraint expression is ill-formed}}
-
+ // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
+ // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
static_assert(Or3<S>() == 3);
static_assert(Or3<int, S>() == 3);
static_assert(Or3<S, int>() == 3);
static_assert(Or3<S, S>() == 3);
static_assert(Or3<int>() == 3); // expected-error {{no matching function for call to 'Or3'}}
- // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} \
- // expected-note@#or3 {{because substituted constraint expression is ill-formed}}
+ // expected-note@#or3 {{candidate template ignored: constraints not satisfied}}
+ // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}}
+ // expected-note@#C {{because 'T' does not satisfy 'A'}}
}
namespace bool_conversion_break {
@@ -226,7 +255,7 @@ struct Thingy {
static constexpr int compare(const Thingy&) {return 1;}
};
template <typename ...T, typename ...U>
-void f(A<T ...> *, A<U ...> *) // expected-note {{candidate template ignored: failed template argument deduction}}
+void f(A<T ...> *, A<U ...> *) // expected-note {{candidate template ignored: constraints not satisfied}}
requires (T::compare(U{}) && ...); // expected-error {{atomic constraint must be of type 'bool' (found 'int')}}
void g() {
@@ -269,9 +298,7 @@ struct S {
static_assert(S<int>::f<int>() == 2);
-static_assert(S<int>::g<int>() == 2); // expected-error {{call to 'g' is ambiguous}}
- // expected-note@#nested-ambiguous-g1 {{candidate}}
- // expected-note@#nested-ambiguous-g2 {{candidate}}
+static_assert(S<int>::g<int>() == 2);
}
@@ -384,3 +411,98 @@ struct LazyLitMatrix<index_by<Indices...>, init> {
}
}
+
+namespace GH135190 {
+template <typename T>
+concept A = __is_same_as(T, int) || __is_same_as(T, double);
+
+template <typename T>
+concept B = A<T> && __is_same_as(T, double);
+
+template <class... Ts>
+requires(A<Ts> && ...)
+constexpr int g() {
+ return 1;
+}
+
+template <class... Ts>
+requires(B<Ts> && ...)
+constexpr int g() {
+ return 2;
+}
+
+static_assert(g<double>() == 2);
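+// B<T> is A<T> && __is_same_as(T, double), so once fold expansions are
+// compared piecewise, (B<Ts> && ...) subsumes (A<Ts> && ...) and the second
+// overload is more constrained.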
+
+
+template <class... Ts>
+concept all_A = (A<Ts> && ...);
+
+template <class... Ts>
+concept all_B = (B<Ts> && ...);
+
+template <class... Ts>
+requires all_A<Ts...>
+constexpr int h() {
+ return 1;
+}
+
+template <class... Ts>
+requires all_B<Ts...>
+constexpr int h() {
+ return 2;
+}
+
+static_assert(h<double>() == 2);
+}
+
+
+namespace parameter_mapping_regressions {
+
+namespace case1 {
+namespace std {
+template <class _Tp, class... _Args>
+constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...);
+template <class _Tp, class... _Args>
+concept constructible_from = is_constructible_v<_Tp, _Args...>;
+template <class _Tp>
+concept default_initializable = true;
+template <class> using iterator_t = int;
+template <class _Tp>
+concept view = constructible_from<_Tp, _Tp>;
+template <class... _Views>
+ requires(view<_Views> && ...)
+class zip_transform_view;
+} // namespace std
+struct IterDefaultCtrView {};
+template <class... Views>
+using Iter = std::iterator_t<std::zip_transform_view<Views...>>;
+static_assert(
+ std::default_initializable<Iter<IterDefaultCtrView, IterDefaultCtrView>>);
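+// Regression check: instantiating Iter<...> substitutes Views... into the
+// parameter mapping of the fold (view<_Views> && ...) on zip_transform_view.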
+
+}
+
+namespace case2 {
+
+template <class _Bp>
+constexpr bool False = false;
+
+template <class... _Views>
+concept __zip_all_random_access = (False<_Views> && ...);
+// expected-note@-1 {{evaluated to false}}
+
+template <typename... _Views>
+struct zip_view {
+  void f() requires __zip_all_random_access<_Views...> {};
+ // expected-note@-1 {{because 'int' does not satisfy}}
+};
+
+zip_view<int> test_v;
+static_assert(!__zip_all_random_access<int>);
+
+void test() {
+ test_v.f(); // expected-error {{invalid reference to function 'f'}}
+}
+
+}
+
+}
diff --git a/clang/test/SemaCXX/cxx2c-template-template-param.cpp b/clang/test/SemaCXX/cxx2c-template-template-param.cpp
index ed55a059..4ad3fd9 100644
--- a/clang/test/SemaCXX/cxx2c-template-template-param.cpp
+++ b/clang/test/SemaCXX/cxx2c-template-template-param.cpp
@@ -106,7 +106,7 @@ concept BinaryDefaultedFalse = false;
template <template <typename...> concept C, typename T>
struct S {
- template <C TT> // expected-note {{because 'int' does not satisfy 'UnaryFalse'}}
+ template <C TT> // expected-note 2{{because 'int' does not satisfy 'UnaryFalse'}}
void f(TT); // expected-note {{ignored}}
void g(C auto); // expected-note {{ignored}} \
// expected-note {{because 'int' does not satisfy 'UnaryFalse'}}
@@ -171,7 +171,7 @@ concept BinaryDefaultedFalse = false;
template <template <typename...> concept C, typename T>
struct S {
- template <C TT> // expected-note {{because 'int' does not satisfy 'UnaryFalse'}}
+ template <C TT> // expected-note 2{{because 'int' does not satisfy 'UnaryFalse'}}
void f(TT); // expected-note {{ignored}}
void g(C auto); // expected-note {{ignored}} \
// expected-note {{because 'int' does not satisfy 'UnaryFalse'}}
diff --git a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
index 436dfb9..8400340 100644
--- a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
+++ b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
@@ -1,6 +1,6 @@
// RUN: %clang -fsyntax-only -std=c++2a -Xclang -verify -ftemplate-depth=5 -ftemplate-backtrace-limit=4 %s
-// RequiresExpr contains invalid requirement. (Eg. Highly recurisive template).
+// RequiresExpr contains an invalid requirement (e.g., a highly recursive template).
template<int x>
struct A { static constexpr bool far(); };
class B {
@@ -19,7 +19,7 @@ constexpr bool A<x>::far() {
// expected-error@#Invalid {{recursive template instantiation exceeded maximum depth}}
// expected-note@#Invalid 3 {{while}}
// expected-note@#Invalid {{contexts in backtrace}}
- // expected-note@#Invalid {{increase recursive template instantiation depth}}
+ // expected-note@#Invalid {{use -ftemplate-depth=N to increase}}
};
}
static_assert(A<1>::far());
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 135865c..c3bda39 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -102,7 +102,7 @@ static_assert(__is_constructible(Movable, int));
// expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \
// expected-note@-1 2{{}}
// expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}}
-// expected-note@#err-self-constraint-1 4{{}}
+// expected-note@#err-self-constraint-1 3{{}}
// expected-note@#Movable {{'Movable' defined here}}
template <typename T>
@@ -200,7 +200,6 @@ void h(short n) { f(n); }
// expected-note@-1{{while checking constraint satisfaction for template}}
// expected-note@#GH62096-note1{{in instantiation}}
// expected-note@#GH62096-note1{{while substituting template arguments into constraint expression here}}
-// expected-note@#GH62096-note2{{while substituting template arguments into constraint expression here}}
// expected-note@#GH62096-note2{{while checking the satisfaction of concept}}
// expected-note@#GH62096-err {{expression evaluates}}
}
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index d49330f..901d510 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -5129,12 +5129,12 @@ namespace GH121278 {
#if __cplusplus >= 202002L
template <typename B, typename D>
concept C = __is_base_of(B, D);
-// expected-error@-1 {{incomplete type 'GH121278::S' used in type trait expression}}
+// expected-error@-1 {{incomplete type 'S' used in type trait expression}}
// expected-note@-2 {{while substituting template arguments into constraint expression here}}
struct T;
struct S;
bool b = C<T, S>;
-// expected-note@-1 {{while checking the satisfaction of concept 'C<GH121278::T, GH121278::S>' requested here}}
+// expected-note@-1 {{while checking the satisfaction of concept 'C<T, S>' requested here}}
#endif
}
diff --git a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl
index d7c6876..999372c 100644
--- a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl
@@ -19,7 +19,7 @@ Buffer<double2> r4;
// expected-error@+4 {{constraints not satisfied for class template 'Buffer'}}
// expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class Buffer}}
-// expected-note@*:* {{because 'hlsl::Buffer<int>' does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'Buffer<int>' does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::Buffer<int>)' evaluated to false}}
Buffer<Buffer<int> > r5;
@@ -65,7 +65,7 @@ Buffer<half[4]> r10;
typedef vector<int, 8> int8;
// expected-error@+3 {{constraints not satisfied for class template 'Buffer'}}
-// expected-note@*:* {{because 'vector<int, 8>' (vector of 8 'int' values) does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}}
Buffer<int8> r11;
@@ -90,7 +90,7 @@ enum numbers { one, two, three };
Buffer<numbers> r15;
// expected-error@+3 {{constraints not satisfied for class template 'Buffer'}}
-// expected-note@*:* {{because 'vector<double, 3>' (vector of 3 'double' values) does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}}
Buffer<double3> r16;
diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
index 361f4303..b33f2af 100644
--- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
@@ -19,7 +19,7 @@ RWBuffer<double2> r4;
// expected-error@+4 {{constraints not satisfied for class template 'RWBuffer'}}
// expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class RWBuffer}}
-// expected-note@*:* {{because 'hlsl::RWBuffer<int>' does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'RWBuffer<int>' does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::RWBuffer<int>)' evaluated to false}}
RWBuffer<RWBuffer<int> > r5;
@@ -65,7 +65,7 @@ RWBuffer<half[4]> r10;
typedef vector<int, 8> int8;
// expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}}
-// expected-note@*:* {{because 'vector<int, 8>' (vector of 8 'int' values) does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}}
RWBuffer<int8> r11;
@@ -90,7 +90,7 @@ enum numbers { one, two, three };
RWBuffer<numbers> r15;
// expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}}
-// expected-note@*:* {{because 'vector<double, 3>' (vector of 3 'double' values) does not satisfy '__is_typed_resource_element_compatible'}}
+// expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}}
// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}}
RWBuffer<double3> r16;
diff --git a/clang/test/SemaTemplate/GH161657.cpp b/clang/test/SemaTemplate/GH161657.cpp
index 6ec7931..5ad4dde 100644
--- a/clang/test/SemaTemplate/GH161657.cpp
+++ b/clang/test/SemaTemplate/GH161657.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++20 -ffp-exception-behavior=strict -verify %s
+// RUN: %clang_cc1 -triple=x86_64 -fsyntax-only -std=c++20 -ffp-exception-behavior=strict -verify %s
// expected-no-diagnostics
template <class T> struct S {
diff --git a/clang/test/SemaTemplate/concepts-recovery-expr.cpp b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
index 6bed179..aa4ed53 100644
--- a/clang/test/SemaTemplate/concepts-recovery-expr.cpp
+++ b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
@@ -4,7 +4,7 @@
constexpr bool CausesRecoveryExpr = "test" + 1.0f;
template<typename T>
-concept ReferencesCRE = CausesRecoveryExpr;
+concept ReferencesCRE = CausesRecoveryExpr; // #subst1
template<typename T> requires CausesRecoveryExpr // #NVC1REQ
void NoViableCands1(){} // #NVC1
@@ -19,16 +19,18 @@ void NVCUse() {
NoViableCands1<int>();
// expected-error@-1 {{no matching function for call to 'NoViableCands1'}}
// expected-note@#NVC1{{candidate template ignored: constraints not satisfied}}
+ // expected-note@#NVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
// expected-note@#NVC1REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
NoViableCands2<int>();
// expected-error@-1 {{no matching function for call to 'NoViableCands2'}}
// expected-note@#NVC2{{candidate template ignored: constraints not satisfied}}
- // expected-note@#NVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
NoViableCands3<int>();
// expected-error@-1 {{no matching function for call to 'NoViableCands3'}}
// expected-note@#NVC3{{candidate template ignored: constraints not satisfied}}
- // expected-note@#NVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#NVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
}
template<typename T> requires CausesRecoveryExpr // #OVC1REQ
@@ -58,12 +60,14 @@ void OVCUse() {
// expected-error@-1 {{no matching function for call to 'OtherViableCands2'}}
// expected-note@#OVC2_ALT {{candidate function}}
// expected-note@#OVC2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#OVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#OVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
OtherViableCands3<int>();
// expected-error@-1 {{no matching function for call to 'OtherViableCands3'}}
// expected-note@#OVC3_ALT {{candidate function}}
// expected-note@#OVC3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#OVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#OVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
}
template<typename T> requires CausesRecoveryExpr // #OBNVC1REQ
@@ -95,13 +99,15 @@ void OBNVCUse() {
// expected-note@#OBNVC2_ALT {{candidate template ignored: constraints not satisfied}}
// expected-note@#OBNVC2REQ_ALT {{because 'false' evaluated to false}}
// expected-note@#OBNVC2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#OBNVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#OBNVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
OtherBadNoViableCands3<int>();
// expected-error@-1 {{no matching function for call to 'OtherBadNoViableCands3'}}
// expected-note@#OBNVC3_ALT {{candidate template ignored: constraints not satisfied}}
// expected-note@#OBNVC3REQ_ALT {{because 'false' evaluated to false}}
// expected-note@#OBNVC3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#OBNVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#OBNVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
}
@@ -136,12 +142,14 @@ void MemOVCUse() {
// expected-error@-1 {{no matching member function for call to 'OtherViableCands2'}}
// expected-note@#MEMOVC2_ALT {{candidate function}}
// expected-note@#MEMOVC2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#MEMOVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#MEMOVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
S.OtherViableCands3<int>();
// expected-error@-1 {{no matching member function for call to 'OtherViableCands3'}}
// expected-note@#MEMOVC3_ALT {{candidate function}}
// expected-note@#MEMOVC3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#MEMOVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#MEMOVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
}
struct StaticOVC {
@@ -173,12 +181,14 @@ void StaticMemOVCUse() {
// expected-error@-1 {{no matching function for call to 'OtherViableCands2'}}
// expected-note@#SMEMOVC2_ALT {{candidate function}}
// expected-note@#SMEMOVC2 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#SMEMOVC2REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#SMEMOVC2REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
StaticOVC::OtherViableCands3<int>();
// expected-error@-1 {{no matching function for call to 'OtherViableCands3'}}
// expected-note@#SMEMOVC3_ALT {{candidate function}}
// expected-note@#SMEMOVC3 {{candidate template ignored: constraints not satisfied}}
- // expected-note@#SMEMOVC3REQ{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#SMEMOVC3REQ{{because 'int' does not satisfy 'ReferencesCRE'}}
+ // expected-note@#subst1{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
}
namespace GH58548 {
diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
index 097cad1..73dce93 100644
--- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp
+++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
@@ -12,7 +12,7 @@ void g() {
// expected-note@#FDEF{{because 'int' does not satisfy 'c'}}
// expected-note@#CDEF{{because 'f(t)' would be invalid: no matching function for call to 'f'}}
}
-} // namespace GH53213
+} // namespace GH53213
namespace GH45736 {
struct constrained;
@@ -67,15 +67,14 @@ struct my_range{
void baz() {
auto it = begin(rng); // #BEGIN_CALL
-// expected-error@#INF_BEGIN {{satisfaction of constraint 'Inf<Inf auto>' depends on itself}}
-// expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}}
+// expected-error-re@#INF_REQ {{satisfaction of constraint {{.*}} depends on itself}}
+// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}}
// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}}
// expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}}
// expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}}
// expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}}
-// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}}
-// expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}}
-// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}}
+// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<struct my_range>' requested here}}
+// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<struct my_range>' required here}}
// expected-note@#BEGIN_CALL {{while substituting deduced template arguments into function template}}
// Fallout of the failure is failed lookup, which is necessary to stop odd
@@ -83,6 +82,7 @@ auto it = begin(rng); // #BEGIN_CALL
// expected-error@#BEGIN_CALL {{no matching function for call to 'begin'}}
// expected-note@#NOTINF_BEGIN {{candidate function}}
// expected-note@#INF_BEGIN{{candidate template ignored: constraints not satisfied}}
+// expected-note@#INF_BEGIN{{because 'Inf auto' does not satisfy 'Inf'}}
}
} // namespace DirectRecursiveCheck
@@ -100,16 +100,17 @@ namespace GH50891 {
static_assert(Numeric<Deferred>); // #STATIC_ASSERT
// expected-error@#NUMERIC{{satisfaction of constraint 'requires (T a) { foo(a); }' depends on itself}}
// expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}}
- // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric<GH50891::Deferred>' requested here}}
- // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}}
- // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}}
- // expected-note@#FOO_CALL {{while substituting deduced template arguments into function template}}
- // expected-note@#FOO_CALL {{in instantiation of requirement here}}
+ // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric<Deferred>' requested here}}
+ // expected-note@#OP_TO {{skipping 1 context}}
+ // expected-note@#FOO_CALL 2{{while checking constraint satisfaction for template}}
+ // expected-note@#FOO_CALL 2{{while substituting deduced template arguments into function template}}
+ // expected-note@#FOO_CALL 2{{in instantiation of requirement here}}
// expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}}
// expected-error@#STATIC_ASSERT {{static assertion failed}}
- // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric<GH50891::Deferred>' requested here}}
- // expected-note@#STATIC_ASSERT{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+ // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric<Deferred>' requested here}}
+ // expected-note@#STATIC_ASSERT{{because 'Deferred' does not satisfy 'Numeric'}}
+ // expected-note@#FOO_CALL{{because 'foo(a)' would be invalid}}
} // namespace GH50891
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 209e7dc..6d29f8b 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1002,7 +1002,7 @@ template<class>
concept Irrelevant = false;
template <typename T>
-concept ErrorRequires = requires(ErrorRequires auto x) { x; };
+concept ErrorRequires = requires(ErrorRequires auto x) { x; }; //#GH54678-ill-formed-concept
// expected-error@-1 {{a concept definition cannot refer to itself}} \
// expected-error@-1 {{'auto' not allowed in requires expression parameter}} \
// expected-note@-1 {{declared here}}
@@ -1023,8 +1023,7 @@ template<class T> void eee(T t) // expected-note {{candidate template ignored: c
requires (Irrelevant<T> || Irrelevant<T> || True<T>) && False<T> {} // expected-note {{'long' does not satisfy 'False'}}
template<class T> void fff(T t) // expected-note {{candidate template ignored: constraints not satisfied}}
-requires((ErrorRequires<T> || False<T> || True<T>) && False<T>) {} // expected-note {{'unsigned long' does not satisfy 'False'}}
-
+requires((ErrorRequires<T> || False<T> || True<T>) && False<T>) {} // expected-note {{because 'unsigned long' does not satisfy 'False'}}
void test() {
aaa(42); // expected-error {{no matching function}}
bbb(42L); // expected-error{{no matching function}}
@@ -1264,12 +1263,7 @@ C auto x = 0;
// expected-error@#T_Type {{type 'int' cannot be used prior to '::'}} \
// expected-note@-1 {{in instantiation of default argument}}
-// This will be fixed when we merge https://github.com/llvm/llvm-project/pull/141776
-// Which makes us behave like GCC.
static_assert(f(0));
-// expected-error@-1 {{no matching function for call}} \
-// expected-note@#GH61824_f {{constraints not satisfied}} \
-// expected-note@#T_Type {{type 'int' cannot be used prior to '::'}}
}
@@ -1278,4 +1272,65 @@ template <typename T> concept PerfectSquare = [](){} // expected-note 2{{here}}
([](auto) { return true; }) < PerfectSquare <class T>;
// expected-error@-1 {{declaration of 'T' shadows template parameter}} \
// expected-error@-1 {{a concept definition cannot refer to itself}}
+
+}
+namespace GH61811 {
+template <class T> struct A { static const int x = 42; };
+template <class Ta> concept A42 = A<Ta>::x == 42;
+template <class Tv> concept Void = __is_same_as(Tv, void);
+template <class Tb, class Ub> concept A42b = Void<Tb> || A42<Ub>;
+template <class Tc> concept R42c = A42b<Tc, Tc&>;
+static_assert (R42c<void>);
+}
+
+namespace parameter_mapping_regressions {
+
+namespace case1 {
+
+template <template <class> class> using __meval = struct __q;
+template <template <class> class _Tp>
+concept __mvalid = requires { typename __meval<_Tp>; };
+template <class _Fn>
+concept __minvocable = __mvalid<_Fn::template __f>;
+template <class...> struct __mdefer_;
+template <class _Fn, class... _Args>
+ requires __minvocable<_Fn>
+struct __mdefer_<_Fn, _Args...> {};
+template <class = __q> struct __mtransform {
+ template <class> using __f = int;
+};
+struct __completion_domain_or_none_ : __mdefer_<__mtransform<>> {};
+
+}
+
+namespace case2 {
+
+template<auto& Q, class P> concept C = Q.template operator()<P>();
+template<class P> concept E = C<[]<class Ty>{ return false; }, P>;
+static_assert(!E<int>);
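+// Q.template operator()<P>() is false for every P, so !E<int> holds; the
+// point is that a lambda bound to Q survives constraint normalization.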
+
+}
+
+
+namespace case3 {
+template <class> constexpr bool is_move_constructible_v = false;
+
+template <class _Tp>
+concept __cpp17_move_constructible = is_move_constructible_v<_Tp>; // #is_move_constructible_v
+
+template <class _Tp>
+concept __cpp17_copy_constructible = __cpp17_move_constructible<_Tp>; // #__cpp17_move_constructible
+
+template <class _Iter>
+concept __cpp17_iterator = __cpp17_copy_constructible<_Iter>; // #__cpp17_copy_constructible
+
+struct not_move_constructible {};
+static_assert(__cpp17_iterator<not_move_constructible>); \
+// expected-error {{static assertion failed}} \
+// expected-note {{because 'not_move_constructible' does not satisfy '__cpp17_iterator'}} \
+// expected-note@#__cpp17_copy_constructible {{because 'not_move_constructible' does not satisfy '__cpp17_copy_constructible'}} \
+// expected-note@#__cpp17_move_constructible {{because 'parameter_mapping_regressions::case3::not_move_constructible' does not satisfy '__cpp17_move_constructible'}} \
+// expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}}
+}
+
}
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index e2b586e..9e5756f 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -574,8 +574,9 @@ static_assert(x.size == 4);
// CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} <col:18, col:24> col:21 'U (&)[3]'
// CHECK-NEXT: | `-ConceptSpecializationExpr 0x{{.+}} <col:36, col:42> 'bool' Concept 0x{{.+}} 'True'
// CHECK-NEXT: | |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28
-// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-0'
-// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0
+// CHECK-NEXT: | | `-TemplateArgument type 'T'
+// CHECK-NEXT: | | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0
+// CHECK-NEXT: | | `-TemplateTypeParm 0x{{.+}} 'T'
// CHECK-NEXT: | `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0'
// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0
// CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'T'
@@ -588,8 +589,9 @@ static_assert(x.size == 4);
// CHECK-NEXT: |-ParmVarDecl 0x{{.+}} <col:18, col:24> col:21 'double (&)[3]'
// CHECK-NEXT: `-ConceptSpecializationExpr 0x{{.+}} <col:36, col:42> 'bool' Concept 0x{{.+}} 'True'
// CHECK-NEXT: |-ImplicitConceptSpecializationDecl 0x{{.+}} <{{.+}}> col:28
-// CHECK-NEXT: | `-TemplateArgument type 'type-parameter-0-0'
-// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'type-parameter-0-0' dependent depth 0 index 0
+// CHECK-NEXT: | `-TemplateArgument type 'T'
+// CHECK-NEXT: | `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0
+// CHECK-NEXT: | `-TemplateTypeParm 0x{{.+}} 'T'
// CHECK-NEXT: `-TemplateArgument <{{.+}}> type 'T':'type-parameter-0-0'
// CHECK-NEXT: `-TemplateTypeParmType 0x{{.+}} 'T' dependent depth 0 index 0
// CHECK-NEXT: `-TemplateTypeParm 0x{{.+}} 'T'
@@ -660,8 +662,9 @@ Test test(42);
// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'Constraint' depth 0 index 1 auto:1
// CHECK-NEXT: | `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'Constraint'
// CHECK-NEXT: | |-ImplicitConceptSpecializationDecl {{.*}}
-// CHECK-NEXT: | | |-TemplateArgument type 'type-parameter-0-1'
-// CHECK-NEXT: | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
+// CHECK-NEXT: | | |-TemplateArgument type 'auto:1'
+// CHECK-NEXT: | | | `-TemplateTypeParmType {{.*}} 'auto:1' dependent depth 0 index 1
+// CHECK-NEXT: | | | `-TemplateTypeParm {{.*}} 'auto:1'
// CHECK-NEXT: | | `-TemplateArgument type 'int'
// CHECK-NEXT: | | `-BuiltinType {{.*}} 'int'
// CHECK-NEXT: | |-TemplateArgument {{.*}} type 'auto:1':'type-parameter-0-1'
diff --git a/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp b/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp
index 1f2171a..e03756e 100644
--- a/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp
+++ b/clang/test/SemaTemplate/instantiate-abbreviated-template.cpp
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 -std=c++2a -x c++ %s -verify
+
template<typename...>
concept C = false; // expected-note 9{{because}}
diff --git a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp
index 3edf243..de4a484 100644
--- a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp
+++ b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp
@@ -7,8 +7,7 @@ template<typename T>
constexpr bool is_same_v<T, T> = true;
template<typename T, typename U>
-concept same_as = is_same_v<T, U>;
-// expected-note@-1{{because 'is_same_v<int, bool>' evaluated to false}}
+concept same_as = is_same_v<T, U>; //#is_same_v
template<typename T, typename... Us>
concept either = (is_same_v<T, Us> || ...);
@@ -17,6 +16,7 @@ template<typename... Ts>
struct T {
template<same_as<Ts>... Us>
// expected-note@-1{{because 'same_as<int, bool>' evaluated to false}}
+ // expected-note@#is_same_v{{because 'is_same_v<int, bool>' evaluated to false}}
static void foo(Us... u, int x) { };
// expected-note@-1{{candidate template ignored: deduced too few arguments}}
// expected-note@-2{{candidate template ignored: constraints not satisfied}}
diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp
index e60f792..32ad537 100644
--- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp
+++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp
@@ -72,12 +72,12 @@ namespace type_requirement {
template<typename T> requires
false_v<requires { typename T::template temp<T>; }>
- // expected-note@-1 {{because 'false_v<requires { typename type_requirement::contains_template<int>::template temp<type_requirement::contains_template<int>>; }>' evaluated to false}}
- // expected-note@-2 {{because 'false_v<requires { typename type_requirement::contains_template<short>::template temp<type_requirement::contains_template<short>>; }>' evaluated to false}}
+ // expected-note@-1 {{because 'false_v<requires { typename contains_template<int>::template temp<contains_template<int>>; }>' evaluated to false}}
+ // expected-note@-2 {{because 'false_v<requires { typename contains_template<short>::template temp<contains_template<short>>; }>' evaluated to false}}
struct r2 {};
- using r2i1 = r2<contains_template<int>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template<int>]}}
- using r2i2 = r2<contains_template<short>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template<short>]}}
+ using r2i1 = r2<contains_template<int>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template<int>]}}
+ using r2i2 = r2<contains_template<short>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template<short>]}}
// substitution error occurs, then requires expr is instantiated again
@@ -108,7 +108,7 @@ namespace type_requirement {
// expected-note@-1 {{because 'false_v<requires { <<error-type>>; } && requires { <<error-type>>; }>' evaluated to false}}
struct r7 {};
- using r7i = r7<int, A>; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = <int, type_requirement::A>]}}
+ using r7i = r7<int, A>; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = <int, A>]}}
}
namespace expr_requirement {
@@ -268,3 +268,13 @@ struct Foo {
};
} // namespace GH110785
+
+namespace sugared_instantiation {
+ template <class C1> concept C = requires { C1{}; };
+ template <class D1> concept D = requires { new D1; };
+
+ // Test that 'deduced auto' doesn't get confused with 'undeduced auto'.
+ auto f() { return 0; }
+ static_assert(requires { { f() } -> C; });
+ static_assert(requires { { f() } -> D; });
+} // namespace sugared_instantiation
diff --git a/clang/test/SemaTemplate/instantiate-template-argument.cpp b/clang/test/SemaTemplate/instantiate-template-argument.cpp
index 43d5d00..7606619 100644
--- a/clang/test/SemaTemplate/instantiate-template-argument.cpp
+++ b/clang/test/SemaTemplate/instantiate-template-argument.cpp
@@ -1,4 +1,6 @@
-// RUN: %clang_cc1 -std=c++2a -x c++ %s -verify
+// RUN: %clang_cc1 -std=c++2a -x c++ %s -verify=expected,cxx20
+// RUN: %clang_cc1 -std=c++2c -x c++ %s -verify
+
template<auto T, decltype(T) U>
concept C1 = sizeof(U) >= 4;
@@ -9,20 +11,101 @@ concept C2 = C1<Y{}, V>;
// sizeof(U) >= 4 [U = V (decltype(Y{}))]
template<char W>
-constexpr int foo() requires C2<int, W> { return 1; }
+constexpr int foo() requires C2<int, W> { return 1; } // #cand1
// sizeof(U) >= 4 [U = W (decltype(int{}))]
template<char X>
-// expected-note@+1{{candidate function}}
-constexpr int foo() requires C1<1, X> && true { return 2; }
+constexpr int foo() requires C1<1, X> && true { return 2; } // #cand2
// sizeof(U) >= 4 [U = X (decltype(1))]
static_assert(foo<'a'>() == 2);
+
template<char Z>
-// expected-note@+1{{candidate function}}
-constexpr int foo() requires C2<long long, Z> && true { return 3; }
+constexpr int foo() requires C2<long long, Z> && true { return 3; } // #cand3
// sizeof(U) >= 4 [U = Z (decltype(long long{}))]
static_assert(foo<'a'>() == 3);
-// expected-error@-1{{call to 'foo' is ambiguous}}
\ No newline at end of file
+// expected-error@-1{{call to 'foo' is ambiguous}}
+// expected-note@#cand2 {{candidate function}}
+// expected-note@#cand3 {{candidate function}}
+
+
+namespace case1 {
+
+template<auto T, decltype(T) U>
+concept C1 = sizeof(T) >= 4; // #case1_C1
+
+template<typename Y, char V>
+concept C2 = C1<Y{}, V>; // #case1_C2
+
+template<class T, char W>
+constexpr int foo() requires C2<T, W> { return 1; } // #case1_foo1
+
+template<class T, char X>
+constexpr int foo() requires C1<T{}, X> && true { return 2; } // #case1_foo2
+
+static_assert(foo<char, 'a'>() == 2);
+// expected-error@-1{{no matching function for call to 'foo'}}
+// expected-note@#case1_foo1{{candidate template ignored: constraints not satisfied [with T = char, W = 'a']}}
+// expected-note@#case1_foo1{{because 'C2<char, 'a'>' evaluated to false}}
+// expected-note@#case1_C2{{because 'C1<char{}, 'a'>' evaluated to false}}
+// expected-note@#case1_C1{{because 'sizeof ('\x00') >= 4' (1 >= 4) evaluated to false}}
+// expected-note@#case1_foo2{{candidate template ignored: constraints not satisfied [with T = char, X = 'a']}}
+// expected-note@#case1_foo2{{because 'C1<char{}, 'a'>' evaluated to false}}
+// expected-note@#case1_C1{{because 'sizeof ('\x00') >= 4' (1 >= 4) evaluated to false}}
+
+static_assert(foo<int, 'a'>() == 2);
+
+}
+
+namespace packs {
+
+template<auto T, decltype(T) U>
+concept C1 = sizeof(U) >= 4;
+
+template<typename Y, char V>
+concept C2 = C1<Y{}, V>;
+
+template<char... W>
+constexpr int foo() requires (C2<int, W> && ...) { return 1; } // #packs-cand1
+
+template<char... X>
+constexpr int foo() requires (C1<1, X> && ...) && true { return 2; } // #packs-cand2
+
+static_assert(foo<'a'>() == 2);
+// cxx20-error@-1{{call to 'foo' is ambiguous}}
+// cxx20-note@#packs-cand1 {{candidate function}}
+// cxx20-note@#packs-cand2 {{candidate function}}
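+// In C++2c, comparing fold expansions lets (C1<1, X> && ...) && true subsume
+// (C2<int, W> && ...), so the ambiguity is diagnosed only in C++20.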
+
+}
+
+namespace case2 {
+template<auto T> concept C1 = sizeof(decltype(T)) >= 0;
+template<typename Y> concept C2 = C1<Y{}>;
+
+template<char W>
+constexpr int foo() requires C2<int> { return 1; }
+
+template<char X>
+constexpr int foo() requires C1<0> && true { return 2; }
+
+static_assert(foo<0>() == 2);
+}
+
+namespace case3 {
+template<auto T> concept C1 = sizeof(decltype(T)) >= 0;
+
+template<typename Y> concept C2 = C1<Y{}>;
+
+template<char W>
+constexpr int foo() requires C2<int> { return 1; } // #case3_foo1
+
+template<char X>
+constexpr int foo() requires C1<1> && true { return 2; } // #case3_foo2
+
+static_assert(foo<0>() == 2);
+// expected-error@-1{{call to 'foo' is ambiguous}}
+// expected-note@#case3_foo1 {{candidate function}}
+// expected-note@#case3_foo2 {{candidate function}}
+}
diff --git a/clang/test/SemaTemplate/pr52970.cpp b/clang/test/SemaTemplate/pr52970.cpp
index 7aac5ee..6aabc41 100644
--- a/clang/test/SemaTemplate/pr52970.cpp
+++ b/clang/test/SemaTemplate/pr52970.cpp
@@ -53,7 +53,7 @@ static_assert(!DotFollowingPointer::f(Bad{}), "");
#if __cplusplus >= 202002L
template <class T>
concept C = requires(T t) { t.begin(); };
- // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Holder<Incomplete> *' is a pointer}}
+ // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Bad' (aka 'Holder<Incomplete> *') is a pointer}}
static_assert(C<Good>);
static_assert(!C<Bad>);
diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp
index 98a225d..0f9f419 100644
--- a/flang-rt/lib/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@@ -789,7 +789,7 @@ void RTDEF(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
std::size_t RTDEF(Scan1)(const char *x, std::size_t xLen, const char *set,
std::size_t setLen, bool back) {
- return ScanVerify<char, CharFunc::Scan>(x, xLen, set, setLen, back);
+ return ScanVerify<false>(x, xLen, set, setLen, back);
}
std::size_t RTDEF(Scan2)(const char16_t *x, std::size_t xLen,
const char16_t *set, std::size_t setLen, bool back) {
@@ -873,7 +873,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
std::size_t RTDEF(Verify1)(const char *x, std::size_t xLen, const char *set,
std::size_t setLen, bool back) {
- return ScanVerify<char, CharFunc::Verify>(x, xLen, set, setLen, back);
+ return ScanVerify<true>(x, xLen, set, setLen, back);
}
std::size_t RTDEF(Verify2)(const char16_t *x, std::size_t xLen,
const char16_t *set, std::size_t setLen, bool back) {
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index bdf7e4a..e006d2e 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -285,11 +285,16 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
if (auto iter{moduleMap.find(name)}; iter != moduleMap.end()) {
modAttr = iter->getValue();
} else {
+    // When decl is true, the module is only used in this compilation unit
+    // and is defined elsewhere. But if the file/line/scope fields are valid,
+    // the module is not merged with its definition and is treated as a
+    // distinct module. So we only set those fields when decl is false.
modAttr = mlir::LLVM::DIModuleAttr::get(
- context, fileAttr, scope, mlir::StringAttr::get(context, name),
+ context, decl ? nullptr : fileAttr, decl ? nullptr : scope,
+ mlir::StringAttr::get(context, name),
/* configMacros */ mlir::StringAttr(),
/* includePath */ mlir::StringAttr(),
- /* apinotes */ mlir::StringAttr(), line, decl);
+ /* apinotes */ mlir::StringAttr(), decl ? 0 : line, decl);
moduleMap[name] = modAttr;
}
return modAttr;
diff --git a/flang/test/Transforms/debug-module-3.fir b/flang/test/Transforms/debug-module-3.fir
new file mode 100644
index 0000000..03cc21e
--- /dev/null
+++ b/flang/test/Transforms/debug-module-3.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module {
+ func.func @_QQmain() {
+ %2 = fir.address_of(@_QMmodEvar1) : !fir.ref<i32> loc(#loc1)
+ %3 = fircg.ext_declare %2 {uniq_name = "_QMmodEvar1"} : (!fir.ref<i32>) -> !fir.ref<i32> loc(#loc1)
+ return
+ } loc(#loc1)
+ fir.global @_QMmodEvar1 : i32 loc(#loc1)
+}
+#loc1 = loc("test1.f90":1:0)
+
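+// The module is only declared here, so the pass emits no file/line/scope.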
+// CHECK: #llvm.di_module<name = "mod", isDecl = true>
diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
index 629a887..70341ee 100644
--- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
@@ -143,7 +143,7 @@ void check_forward_iterator_requirements() {
// expected-note@*:* {{because 'not_default_constructible' does not satisfy '__cpp17_default_constructible'}}
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref, ""); // expected-error {{static assertion failed}}
#ifndef _AIX
- // expected-note-re@*:* {{because type constraint 'convertible_to<{{(valid_iterator<postincrement_not_ref>::)?}}Proxy, const postincrement_not_ref &>' was not satisfied}}
+ // expected-note-re@*:* {{'convertible_to<{{(valid_iterator<postincrement_not_ref>::)?}}Proxy, const postincrement_not_ref &>'}}
#endif
}
@@ -173,7 +173,7 @@ void check_bidirectional_iterator_requirements() {
_LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(missing_postdecrement, ""); // expected-error {{static assertion failed}}
// expected-note@*:* {{cannot decrement value of type 'missing_postdecrement'}}
_LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(not_returning_iter_reference, ""); // expected-error {{static assertion failed}}
- // expected-note-re@*:* {{because type constraint 'same_as<int, __iter_reference<not_returning_iter_reference>{{ ?}}>' was not satisfied}}
+ // expected-note-re@*:* {{'same_as<int, __iter_reference<not_returning_iter_reference>{{ ?}}>'}}
// clang-format on
}
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 6168e24..2e31fe5 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2773,6 +2773,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
}
+/// Matches MaskedStore Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+ const Opnd3 &Op3) {
+ return m_Intrinsic<Intrinsic::masked_store>(Op0, Op1, Op2, Op3);
+}
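+// Illustrative use (mirrors the m_MaskedLoad idiom): bind the operands of an
+// @llvm.masked.store call:
+//   Value *Val, *Ptr, *Align, *Mask;
+//   if (match(&I, m_MaskedStore(m_Value(Val), m_Value(Ptr), m_Value(Align),
+//                               m_Value(Mask)))) { /* ... */ }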
+
/// Matches MaskedGather Intrinsic.
template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 2454149..74a4d6c 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
class PHINode;
class TargetLibraryInfo;
class Value;
+class IntrinsicInst;
/// A private "module" namespace for types and utilities used by GVN. These
/// are implementation details and should not be used by clients.
namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ private:
// Helper functions of redundant load elimination.
bool processLoad(LoadInst *L);
+ bool processMaskedLoad(IntrinsicInst *I);
bool processNonLocalLoad(LoadInst *L);
bool processAssumeIntrinsic(AssumeInst *II);
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index 23b72da..6e316f1 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -280,6 +280,9 @@ std::vector<Block *> LinkGraph::splitBlockImpl(std::vector<Block *> Blocks,
void LinkGraph::dump(raw_ostream &OS) {
DenseMap<Block *, std::vector<Symbol *>> BlockSymbols;
+ OS << "LinkGraph \"" << getName()
+ << "\" (triple = " << getTargetTriple().str() << ")\n";
+
// Map from blocks to the symbols pointing at them.
for (auto *Sym : defined_symbols())
BlockSymbols[&Sym->getBlock()].push_back(Sym);
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 584b9f0..17050b0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -21,23 +21,21 @@ JITLinkerBase::~JITLinkerBase() = default;
void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 1 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 1\n");
// Prune and optimize the graph.
if (auto Err = runPasses(Passes.PrePrunePasses))
return Ctx->notifyFailed(std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" pre-pruning:\n";
+ dbgs() << "Link graph pre-pruning:\n";
G->dump(dbgs());
});
prune(*G);
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" post-pruning:\n";
+ dbgs() << "Link graph post-pruning:\n";
G->dump(dbgs());
});
@@ -67,14 +65,15 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
AllocResult AR) {
+ LLVM_DEBUG(dbgs() << "Starting link phase 2\n");
+
if (AR)
Alloc = std::move(*AR);
else
return Ctx->notifyFailed(AR.takeError());
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName()
- << "\" before post-allocation passes:\n";
+ dbgs() << "Link graph before post-allocation passes:\n";
G->dump(dbgs());
});
@@ -131,9 +130,7 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
Expected<AsyncLookupResult> LR) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 3\n");
// If the lookup failed, bail out.
if (!LR)
@@ -143,8 +140,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
applyLookupResult(*LR);
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName()
- << "\" before pre-fixup passes:\n";
+ dbgs() << "Link graph before pre-fixup passes:\n";
G->dump(dbgs());
});
@@ -152,7 +148,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n";
+ dbgs() << "Link graph before copy-and-fixup:\n";
G->dump(dbgs());
});
@@ -161,7 +157,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n";
+ dbgs() << "Link graph after copy-and-fixup:\n";
G->dump(dbgs());
});
@@ -186,16 +182,14 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
void JITLinkerBase::linkPhase4(std::unique_ptr<JITLinkerBase> Self,
FinalizeResult FR) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 4 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 4\n");
if (!FR)
return Ctx->notifyFailed(FR.takeError());
Ctx->notifyFinalized(std::move(*FR));
- LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; });
+ LLVM_DEBUG({ dbgs() << "Link complete\n"; });
}
Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) {
diff --git a/llvm/lib/Object/BuildID.cpp b/llvm/lib/Object/BuildID.cpp
index 89d6bc3..d1ee597 100644
--- a/llvm/lib/Object/BuildID.cpp
+++ b/llvm/lib/Object/BuildID.cpp
@@ -24,6 +24,24 @@ using namespace llvm::object;
namespace {
template <typename ELFT> BuildIDRef getBuildID(const ELFFile<ELFT> &Obj) {
+ auto findBuildID = [&Obj](const auto &ShdrOrPhdr,
+ uint64_t Alignment) -> std::optional<BuildIDRef> {
+ Error Err = Error::success();
+ for (auto N : Obj.notes(ShdrOrPhdr, Err))
+ if (N.getType() == ELF::NT_GNU_BUILD_ID &&
+ N.getName() == ELF::ELF_NOTE_GNU)
+ return N.getDesc(Alignment);
+ consumeError(std::move(Err));
+ return std::nullopt;
+ };
+
+ auto Sections = cantFail(Obj.sections());
+ for (const auto &S : Sections) {
+ if (S.sh_type != ELF::SHT_NOTE)
+ continue;
+ if (std::optional<BuildIDRef> ShdrRes = findBuildID(S, S.sh_addralign))
+ return ShdrRes.value();
+ }
auto PhdrsOrErr = Obj.program_headers();
if (!PhdrsOrErr) {
consumeError(PhdrsOrErr.takeError());
@@ -32,12 +50,8 @@ template <typename ELFT> BuildIDRef getBuildID(const ELFFile<ELFT> &Obj) {
for (const auto &P : *PhdrsOrErr) {
if (P.p_type != ELF::PT_NOTE)
continue;
- Error Err = Error::success();
- for (auto N : Obj.notes(P, Err))
- if (N.getType() == ELF::NT_GNU_BUILD_ID &&
- N.getName() == ELF::ELF_NOTE_GNU)
- return N.getDesc(P.p_align);
- consumeError(std::move(Err));
+ if (std::optional<BuildIDRef> PhdrRes = findBuildID(P, P.p_align))
+ return PhdrRes.value();
}
return {};
}
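With the section scan in place, getBuildID also works for ELF files that carry SHT_NOTE sections but no program headers (e.g. relocatable objects). A minimal caller sketch, assuming the public llvm::object::getBuildID(const ObjectFile *) wrapper declared in llvm/Object/BuildID.h; printBuildID is an illustrative name:

#include "llvm/Object/Binary.h"
#include "llvm/Object/BuildID.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

// Print the GNU build ID of a binary as lowercase hex, if one is present.
static void printBuildID(StringRef Path) {
  Expected<OwningBinary<Binary>> BinOrErr = createBinary(Path);
  if (!BinOrErr) {
    logAllUnhandledErrors(BinOrErr.takeError(), errs(), Path.str() + ": ");
    return;
  }
  if (auto *Obj = dyn_cast<ObjectFile>(BinOrErr->getBinary())) {
    BuildIDRef ID = getBuildID(Obj); // empty if no NT_GNU_BUILD_ID note
    for (uint8_t B : ID)
      outs() << format_hex_no_prefix(B, 2);
    outs() << "\n";
  }
}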
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8d6eb91..4357264d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -282,7 +282,7 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
static cl::opt<bool>
SplitSVEObjects("aarch64-split-sve-objects",
cl::desc("Split allocation of ZPR & PPR objects"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
cl::opt<bool> EnableHomogeneousPrologEpilog(
"homogeneous-prolog-epilog", cl::Hidden,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8c4b4f6..50a8754 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5632,75 +5632,94 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const {
InstructionCost Invalid = InstructionCost::getInvalid();
- InstructionCost Cost(TTI::TCC_Basic);
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
- // Sub opcodes currently only occur in chained cases.
- // Independent partial reduction subtractions are still costed as an add
+ if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
+ (!ST->isNeonAvailable() || !ST->hasDotProd()))
+ return Invalid;
+
if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
OpAExtend == TTI::PR_None)
return Invalid;
+ assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
+ (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
+ "Unexpected values for OpBExtend or InputTypeB");
+
// We only support multiply binary operations for now, and for muls we
// require the types being extended to be the same.
- // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
- // only if the i8mm or sve/streaming features are available.
- if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
- OpBExtend == TTI::PR_None ||
- (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
- !ST->isSVEorStreamingSVEAvailable())))
+ if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
return Invalid;
- assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
- "Unexpected values for OpBExtend or InputTypeB");
- EVT InputEVT = EVT::getEVT(InputTypeA);
- EVT AccumEVT = EVT::getEVT(AccumType);
+ bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
+ if (IsUSDot && !ST->hasMatMulInt8())
+ return Invalid;
+
+ unsigned Ratio =
+ AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+ if (VF.getKnownMinValue() <= Ratio)
+ return Invalid;
+
+ VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
+ VectorType *AccumVectorType =
+ VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
+ // We don't yet support all kinds of legalization.
+ auto TA = TLI->getTypeAction(AccumVectorType->getContext(),
+ EVT::getEVT(AccumVectorType));
+ switch (TA) {
+ default:
+ return Invalid;
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ case TargetLowering::TypeSplitVector:
+ break;
+ }
+
+  // Check what kind of type legalization happens.
+ std::pair<InstructionCost, MVT> AccumLT =
+ getTypeLegalizationCost(AccumVectorType);
+ std::pair<InstructionCost, MVT> InputLT =
+ getTypeLegalizationCost(InputVectorType);
- unsigned VFMinValue = VF.getKnownMinValue();
+ InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
- if (VF.isScalable()) {
- if (!ST->isSVEorStreamingSVEAvailable())
- return Invalid;
+ // Prefer using full types by costing half-full input types as more expensive.
+ if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
+ TypeSize::getScalable(128)))
+ // FIXME: This can be removed after the cost of the extends are folded into
+ // the dot-product expression in VPlan, after landing:
+ // https://github.com/llvm/llvm-project/pull/147302
+ Cost *= 2;
- // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
- // since we can't lower that type.
- unsigned Scale =
- AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
- if (VFMinValue == Scale)
- return Invalid;
+ if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
+ // i16 -> i64 is natively supported for udot/sdot
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i16)
+ return Cost;
+ // i8 -> i64 is supported with an extra level of extends
+ if (AccumLT.second.getScalarType() == MVT::i64 &&
+ InputLT.second.getScalarType() == MVT::i8)
+ // FIXME: This cost should probably be a little higher, e.g. Cost + 2
+ // because it requires two extra extends on the inputs. But if we'd change
+ // that now, a regular reduction would be cheaper because the costs of
+ // the extends in the IR are still counted. This can be fixed
+ // after https://github.com/llvm/llvm-project/pull/147302 has landed.
+ return Cost;
}
- if (VF.isFixed() &&
- (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
- return Invalid;
- if (InputEVT == MVT::i8) {
- switch (VFMinValue) {
- default:
- return Invalid;
- case 8:
- if (AccumEVT == MVT::i32)
- Cost *= 2;
- else if (AccumEVT != MVT::i64)
- return Invalid;
- break;
- case 16:
- if (AccumEVT == MVT::i64)
- Cost *= 2;
- else if (AccumEVT != MVT::i32)
- return Invalid;
- break;
- }
- } else if (InputEVT == MVT::i16) {
- // FIXME: Allow i32 accumulator but increase cost, as we would extend
- // it to i64.
- if (VFMinValue != 8 || AccumEVT != MVT::i64)
- return Invalid;
- } else
- return Invalid;
+ // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
+ if (ST->isSVEorStreamingSVEAvailable() ||
+ (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
+ ST->hasDotProd())) {
+ if (AccumLT.second.getScalarType() == MVT::i32 &&
+ InputLT.second.getScalarType() == MVT::i8)
+ return Cost;
+ }
- return Cost;
+ // Add additional cost for the extends that would need to be inserted.
+ return Cost + 4;
}
InstructionCost
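A worked example of the new ratio gate: for an i8 input accumulating into i32, Ratio = 32 / 8 = 4 and the scaled accumulator has VF / 4 lanes, so VF <= 4 is rejected because the accumulator would collapse to a single lane. A self-contained sketch of that arithmetic (not the in-tree code):

#include <cassert>

// Mirrors the ratio check in getPartialReductionCost: the scaled accumulator
// has VFMin / Ratio lanes, and the reduction is rejected as Invalid when that
// would leave no more than a single lane.
static bool vfIsWideEnough(unsigned AccumBits, unsigned InputBits,
                           unsigned VFMin) {
  unsigned Ratio = AccumBits / InputBits;
  return VFMin > Ratio;
}

int main() {
  assert(!vfIsWideEnough(32, 8, 4));  // <4 x i8>  -> <1 x i32>: rejected
  assert(vfIsWideEnough(32, 8, 16));  // <16 x i8> -> <4 x i32>: accepted
  assert(vfIsWideEnough(64, 16, 8));  // <8 x i16> -> <2 x i64>: accepted
}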
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 6efa78e..a4ef524 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -608,8 +608,6 @@ public:
? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
: EmptySet;
- const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size();
-
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
// Each iteration of this loop assigns exactly one global variable to
// exactly one of the implementation strategies.
@@ -649,8 +647,7 @@ public:
ModuleScopeVariables.insert(GV);
} else if (K.second.size() == 1) {
KernelAccessVariables.insert(GV);
- } else if (K.second.size() == HybridModuleRootKernelsSize &&
- set_is_subset(K.second, HybridModuleRootKernels)) {
+ } else if (K.second == HybridModuleRootKernels) {
ModuleScopeVariables.insert(GV);
} else {
TableLookupVariables.insert(GV);
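The dropped size check was redundant: set_is_subset plus equal sizes is just set equality, and DenseSet provides operator==. A minimal illustration with the element type simplified (the pass itself compares DenseSet<Function *>):

#include "llvm/ADT/DenseSet.h"

// For equal-sized sets, "A is a subset of B" already implies A == B, so the
// explicit size comparison plus set_is_subset collapses to operator==.
static bool accessedByExactlyTheRootKernels(
    const llvm::DenseSet<int> &Accessors,
    const llvm::DenseSet<int> &RootKernels) {
  return Accessors == RootKernels;
}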
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 76bfce8..5e27b37 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1013,6 +1013,15 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
}
} else if (T == X_CNT) {
+ WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+ if (PendingEvents & (1 << OtherEvent)) {
+      // Hardware inserts an implicit xcnt between interleaved
+      // SMEM and VMEM operations, so there will never be
+      // outstanding address translations for both SMEM and
+      // VMEM at the same time.
+ setScoreLB(T, CurrScore - 1);
+ PendingEvents &= ~(1 << OtherEvent);
+ }
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(&Inst, Op, T, CurrScore);
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -2220,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
+ // For architectures with X_CNT, mark the source address operands
+ // with the appropriate counter values.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
bool IsVMEMAccess = false;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 8f1dd62..5630580 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
let HasSGPR = 1;
let Size = 64;
}
+
+def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128, SReg_128)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 128;
+}
+
+def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128_Align2, SReg_128)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 128;
+}
} // End GeneratePressureSet = 0
// Define a register tuple class, along with one requiring an even
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index fa130a1..26ff54c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -775,6 +775,16 @@ class VectorType;
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
+    /// Return true if it is profitable to fold a variable mask into a pair
+    /// of variable shifts.
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
+ return VT.getScalarSizeInBits() <= 32;
+ }
+
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
unsigned SelectOpcode, SDValue X,
SDValue Y) const override;
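This hook gates the DAG combine that trades a variable mask for two variable shifts; the ARM override opts in for scalar types of at most 32 bits. A sketch of the underlying scalar identity for the clear-high-bits case, assuming logical shifts and Y < 32:

#include <cassert>
#include <cstdint>

// Clearing the top Y bits of X with a variable mask is equivalent to a
// shift-left/shift-right pair by the same variable amount.
static bool maskEqualsShiftPair(uint32_t X, uint32_t Y) { // requires Y < 32
  uint32_t Masked = X & (~0u >> Y);
  uint32_t ShiftPair = (X << Y) >> Y;
  return Masked == ShiftPair;
}

int main() {
  for (uint32_t Y = 0; Y < 32; ++Y)
    assert(maskEqualsShiftPair(0xdeadbeefu, Y));
}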
diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index af1ceb6..cf6f83a 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -110,16 +110,16 @@ def : StPat<truncstorei8, SB, GPR, i16>;
let Predicates = [HasAtomicLdSt] in {
// Prefer unsigned due to no c.lb in Zcb.
- def : LdPat<atomic_load_aext_8, LBU, i16>;
- def : LdPat<atomic_load_nonext_16, LH, i16>;
+ def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>;
+ def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>;
- def : StPat<atomic_store_8, SB, GPR, i16>;
- def : StPat<atomic_store_16, SH, GPR, i16>;
+ def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>;
+ def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>;
}
let Predicates = [HasAtomicLdSt, IsRV64] in {
// Load pattern is in RISCVInstrInfoA.td and shared with RV32.
- def : StPat<atomic_store_32, SW, GPR, i32>;
+ def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>;
}
//===----------------------------------------------------------------------===//
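The relaxed_load/relaxed_store wrappers restrict these patterns to atomic accesses with monotonic-or-weaker ordering, leaving acquire/release forms to patterns that emit the required fences. A sketch of the ordering predicate they encode, in terms of llvm/Support/AtomicOrdering.h (illustrative, not the TableGen definition):

#include "llvm/Support/AtomicOrdering.h"

// An access is "relaxed" for pattern-selection purposes when it carries
// neither acquire nor release semantics.
static bool isRelaxed(llvm::AtomicOrdering AO) {
  return !llvm::isAcquireOrStronger(AO) && !llvm::isReleaseOrStronger(AO);
}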
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index 143c4c4..e7709ef 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -149,6 +149,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
});
}
+ getActionDefinitionsBuilder({G_UMIN, G_UMAX, G_SMIN, G_SMAX})
+ .widenScalarToNextPow2(0, /*Min=*/32)
+ .lower();
+
// integer addition/subtraction
getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({s8, s16, s32})
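Narrow scalar min/max ops are first widened to the next power of two with a 32-bit floor, then lowered; lowering expands each of G_UMIN/G_UMAX/G_SMIN/G_SMAX into a compare feeding a select. The scalar semantics, for reference (a sketch, not the generated MIR):

#include <cstdint>

// What G_UMIN lowers to after widening: an unsigned compare plus a select.
static uint32_t umin32(uint32_t A, uint32_t B) { return A < B ? A : B; }

// Likewise for the signed variant G_SMIN.
static int32_t smin32(int32_t A, int32_t B) { return A < B ? A : B; }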
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cda5568..3802506 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45457,7 +45457,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
- if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
+ if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
+ SrcVT.getScalarType() != MVT::i1)
return SDValue();
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 26e17cc..b9b5b58 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2287,6 +2287,35 @@ bool GVNPass::processLoad(LoadInst *L) {
return true;
}
+// Attempt to process masked loads that load from masked stores with the
+// same mask.
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+ if (!MD)
+ return false;
+ MemDepResult Dep = MD->getDependency(I);
+ Instruction *DepInst = Dep.getInst();
+ if (!DepInst || !Dep.isLocal() || !Dep.isDef())
+ return false;
+
+ Value *Mask = I->getOperand(2);
+ Value *Passthrough = I->getOperand(3);
+ Value *StoreVal;
+ if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
+ m_Specific(Mask))) ||
+ StoreVal->getType() != I->getType())
+ return false;
+
+  // Remove the load, but generate a select for the passthrough lanes.
+ Value *OpToForward = llvm::SelectInst::Create(Mask, StoreVal, Passthrough, "",
+ I->getIterator());
+
+ ICF->removeUsersOf(I);
+ I->replaceAllUsesWith(OpToForward);
+ salvageAndRemoveInstruction(I);
+ ++NumGVNLoad;
+ return true;
+}
+
/// Return a pair, the first field showing the value number of \p Exp and the
/// second field showing whether it is a newly created value number.
std::pair<uint32_t, bool>
@@ -2734,6 +2763,10 @@ bool GVNPass::processInstruction(Instruction *I) {
return false;
}
+ if (match(I, m_Intrinsic<Intrinsic::masked_load>()) &&
+ processMaskedLoad(cast<IntrinsicInst>(I)))
+ return true;
+
// For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
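The IR-level effect of processMaskedLoad: when a masked load's local definition is a masked store with the identical mask, every active lane reads the stored vector and every inactive lane must produce the load's passthrough, which is exactly a select. Lane-wise semantics as a self-contained sketch (not LLVM code):

#include <array>
#include <cstddef>

// Per-lane result of forwarding a masked store to a masked load that uses the
// same mask: the stored value in active lanes, the passthrough in inactive
// lanes -- precisely the select GVN now emits in place of the load.
template <size_t N>
std::array<int, N> maskedForward(const std::array<bool, N> &Mask,
                                 const std::array<int, N> &Stored,
                                 const std::array<int, N> &Passthrough) {
  std::array<int, N> Out{};
  for (size_t I = 0; I != N; ++I)
    Out[I] = Mask[I] ? Stored[I] : Passthrough[I];
  return Out;
}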
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7750687..cb6bfb2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8694,7 +8694,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
Plan->addVF(VF);
if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
- Plan,
+ *Plan,
[this](PHINode *P) {
return Legal->getIntOrFpInductionDescriptor(P);
},
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f76777b..ca63bf3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -45,13 +45,13 @@ static cl::opt<bool> EnableWideActiveLaneMask(
cl::desc("Enable use of wide get active lane mask instructions"));
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
- VPlanPtr &Plan,
+ VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
const TargetLibraryInfo &TLI) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan->getVectorLoopRegion());
+ Plan.getVectorLoopRegion());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
// Skip blocks outside region
if (!VPBB->getParent())
@@ -77,11 +77,11 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
for (VPValue *Op : PhiR->operands())
NewRecipe->addOperand(Op);
} else {
- VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
+ VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
VPValue *Step =
- vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep());
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
NewRecipe = new VPWidenIntOrFpInductionRecipe(
- Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
+ Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
}
} else {
assert(isa<VPInstruction>(&Ingredient) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 4c65cb7..2f00e51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -138,7 +138,7 @@ struct VPlanTransforms {
/// widen recipes. Returns false if any VPInstructions could not be converted
/// to a wide recipe if needed.
LLVM_ABI_FOR_TEST static bool tryToConvertVPInstructionsToVPRecipes(
- VPlanPtr &Plan,
+ VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
const TargetLibraryInfo &TLI);
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 6f98eae..8399292 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -507,14 +507,10 @@ if(build_runtimes)
endif()
# Forward user-provided system configuration to runtimes for requirement introspection.
- # CMAKE_PREFIX_PATH is the search path for CMake packages. In order to pass through
- # the command line interface, the CMake semicolon separator needs to be replaced
- # with $<SEMICOLON>
+ # CMAKE_PREFIX_PATH is the search path for CMake packages.
if(CMAKE_PREFIX_PATH)
- string(JOIN "$<SEMICOLON>" escaped_cmake_prefix_path ${CMAKE_PREFIX_PATH})
- list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${escaped_cmake_prefix_path}")
+ list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}")
endif()
-
# CMAKE_PROGRAM_PATH is the search path for executables such as python.
if(CMAKE_PROGRAM_PATH)
list(APPEND extra_cmake_args "-DCMAKE_PROGRAM_PATH=${CMAKE_PROGRAM_PATH}")
diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
index 1de8d0a..01e3d3a 100644
--- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
@@ -68,13 +68,12 @@ entry:
}
; SVE calling conventions
-; Predicate register spills end up in FP region, currently. This can be
-; mitigated with the -aarch64-enable-zpr-predicate-spills option.
+; Padding is placed between predicate and fpr/zpr register spills, so only emit remarks when hazard padding is off.
+; Note: The -aarch64-enable-zpr-predicate-spills option is deprecated (and will be removed soon).
define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale]
; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
-; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale]
; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call':
; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
@@ -89,7 +88,6 @@ entry:
define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale]
; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
-; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale]
; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call':
; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 333a8be..bdee359 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects=false -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE
define i32 @basic(i32 noundef %num) {
; CHECK-LABEL: basic:
@@ -1940,23 +1940,22 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
;
; CHECK64-LABEL: svecc_call:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w26, -24
-; CHECK64-NEXT: .cfi_offset w27, -32
-; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w26, -16
+; CHECK64-NEXT: .cfi_offset w27, -24
+; CHECK64-NEXT: .cfi_offset w28, -32
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: .cfi_offset w30, -56
; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -1969,30 +1968,32 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: mov x8, x0
; CHECK64-NEXT: bl __arm_sme_state
@@ -2014,22 +2015,32 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: mov w0, #22647 // =0x5877
; CHECK64-NEXT: movk w0, #59491, lsl #16
; CHECK64-NEXT: add sp, sp, #64
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #16
+; CHECK64-NEXT: .cfi_restore z8
+; CHECK64-NEXT: .cfi_restore z9
+; CHECK64-NEXT: .cfi_restore z10
+; CHECK64-NEXT: .cfi_restore z11
+; CHECK64-NEXT: .cfi_restore z12
+; CHECK64-NEXT: .cfi_restore z13
+; CHECK64-NEXT: .cfi_restore z14
+; CHECK64-NEXT: .cfi_restore z15
; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
@@ -2042,20 +2053,11 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #18
-; CHECK64-NEXT: .cfi_restore z8
-; CHECK64-NEXT: .cfi_restore z9
-; CHECK64-NEXT: .cfi_restore z10
-; CHECK64-NEXT: .cfi_restore z11
-; CHECK64-NEXT: .cfi_restore z12
-; CHECK64-NEXT: .cfi_restore z13
-; CHECK64-NEXT: .cfi_restore z14
-; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: addvl sp, sp, #2
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
; CHECK64-NEXT: .cfi_def_cfa_offset 0
; CHECK64-NEXT: .cfi_restore w19
; CHECK64-NEXT: .cfi_restore w26
@@ -2463,23 +2465,22 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
;
; CHECK64-LABEL: svecc_alloca_call:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w26, -24
-; CHECK64-NEXT: .cfi_offset w27, -32
-; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w26, -16
+; CHECK64-NEXT: .cfi_offset w27, -24
+; CHECK64-NEXT: .cfi_offset w28, -32
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: .cfi_offset w30, -56
; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2492,30 +2493,32 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
; CHECK64-NEXT: sub sp, sp, #112
; CHECK64-NEXT: bl __arm_sme_state
; CHECK64-NEXT: mov x19, x0
@@ -2536,22 +2539,32 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: mov w0, #22647 // =0x5877
; CHECK64-NEXT: movk w0, #59491, lsl #16
; CHECK64-NEXT: add sp, sp, #112
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #16
+; CHECK64-NEXT: .cfi_restore z8
+; CHECK64-NEXT: .cfi_restore z9
+; CHECK64-NEXT: .cfi_restore z10
+; CHECK64-NEXT: .cfi_restore z11
+; CHECK64-NEXT: .cfi_restore z12
+; CHECK64-NEXT: .cfi_restore z13
+; CHECK64-NEXT: .cfi_restore z14
+; CHECK64-NEXT: .cfi_restore z15
; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
@@ -2564,20 +2577,11 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #18
-; CHECK64-NEXT: .cfi_restore z8
-; CHECK64-NEXT: .cfi_restore z9
-; CHECK64-NEXT: .cfi_restore z10
-; CHECK64-NEXT: .cfi_restore z11
-; CHECK64-NEXT: .cfi_restore z12
-; CHECK64-NEXT: .cfi_restore z13
-; CHECK64-NEXT: .cfi_restore z14
-; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: addvl sp, sp, #2
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
; CHECK64-NEXT: .cfi_def_cfa_offset 0
; CHECK64-NEXT: .cfi_restore w19
; CHECK64-NEXT: .cfi_restore w26
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 243f0ed..f8655a7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: .LBB5_3: ; %bb4
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 029aa39..ce1ea4d 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -128,13 +128,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0
%2.sub2_sub3:areg_128 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -153,13 +153,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0
%2.sub2_sub3:areg_128_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -398,14 +398,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0
%1.sub1:areg_128 = COPY %0
%1.sub2:areg_128 = COPY %0
%1.sub3:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -425,14 +425,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0
%1.sub1:areg_128_align2 = COPY %0
%1.sub2:areg_128_align2 = COPY %0
%1.sub3:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -641,13 +641,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -668,13 +668,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
%0.sub1:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
%2.sub2_sub3:areg_128_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -890,14 +890,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0.sub0
%1.sub1:areg_128 = COPY %0.sub0
%1.sub2:areg_128 = COPY %0.sub0
%1.sub3:areg_128 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -917,14 +917,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0.sub0
%1.sub1:areg_128_align2 = COPY %0.sub0
%1.sub2:areg_128_align2 = COPY %0.sub0
%1.sub3:areg_128_align2 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -1051,13 +1051,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1076,13 +1076,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -1358,11 +1358,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1379,11 +1379,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 92836d8..63db24a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -486,7 +486,7 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
@@ -516,7 +516,7 @@ body: |
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
S_ENDPGM 0
...
@@ -1368,7 +1368,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1408,7 +1408,7 @@ body: |
undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -1726,7 +1726,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1763,7 +1763,7 @@ body: |
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
%4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 9cbdc38..5b3e486 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8847370 /* regdef:AReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9568266 /* regdef:AReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 6509d80..f88b1bf 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %25
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27
; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21
; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21
@@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
index d7b713a..0b4e662 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
@@ -19,7 +19,7 @@ body: |
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -30,7 +30,7 @@ body: |
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -172,7 +172,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -183,7 +183,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -208,7 +208,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -219,7 +219,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub1:areg_128_align2 = COPY %4.sub2
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
index 57f611b..4c2ea2f 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
@@ -17,7 +17,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -26,7 +26,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -47,7 +47,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -56,7 +56,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -151,7 +151,7 @@ body: |
; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -163,7 +163,7 @@ body: |
%other_use:vreg_64_align2 = COPY %5.sub0_sub1
%6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %8.sub0_sub1:areg_128_align2 = COPY %6
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -231,7 +231,7 @@ body: |
; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -245,7 +245,7 @@ body: |
%other_use1:vreg_64_align2 = COPY %4.sub2_sub3
%other_use2:vreg_64 = COPY %4.sub1_sub2
%6:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index af8b9e7..6fe99d8 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -520,6 +520,7 @@ body: |
; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
$sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
@@ -921,7 +922,6 @@ body: |
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem
tracksRegLiveness: true
@@ -937,6 +937,7 @@ body: |
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -944,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
@@ -960,6 +960,7 @@ body: |
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
@@ -967,7 +968,6 @@ body: |
$sgpr0 = S_MOV_B32 0
...
-# TODO: Unnecessary wait before overwriting vgpr0.
---
name: overwrite_vgpr_after_smem
tracksRegLiveness: true
@@ -981,14 +981,12 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# TODO: Unnecessary wait before overwriting sgpr0.
---
name: overwrite_sgpr_after_vmem
tracksRegLiveness: true
@@ -1002,7 +1000,6 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/ARM/and-mask-variable.ll b/llvm/test/CodeGen/ARM/and-mask-variable.ll
new file mode 100644
index 0000000..0f84b76
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/and-mask-variable.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; V7M-LABEL: mask_pair:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: mask_pair:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: mask_pair:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: mask_pair:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: bx lr
+ %shl = shl nsw i32 -1, %y
+ %and = and i32 %shl, %x
+ ret i32 %and
+}
+
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; V7M-LABEL: mask_pair_64:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: and.w r0, r0, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: mask_pair_64:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: mask_pair_64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: mask_pair_64:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shl = shl nsw i64 -1, %y
+ %and = and i64 %shl, %x
+ ret i64 %and
+}
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
new file mode 100644
index 0000000..77deaa5
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -0,0 +1,4591 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+; Patterns:
+; a) (x >> start) & (1 << nbits) - 1
+; b) (x >> start) & ~(-1 << nbits)
+; c) (x >> start) & (-1 >> (32 - nbits))
+; d) (x >> start) << (32 - nbits) >> (32 - nbits)
+; are equivalent.
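+;
+; For illustration only (hypothetical values, not part of the autogenerated
+; checks below): with x = 0xABCD, start = 4, nbits = 8, the shifted value is
+; 0xABCD >> 4 = 0xABC, and each pattern extracts the same low 8 bits:
+;   a) 0xABC & ((1 << 8) - 1)     = 0xABC & 0xFF      = 0xBC
+;   b) 0xABC & ~(-1 << 8)         = 0xABC & 0xFF      = 0xBC
+;   c) 0xABC & (-1 >> (32 - 8))   = 0xABC & 0xFF      = 0xBC
+;   d) (0xABC << 24) lshr 24      = 0xBC000000 >> 24  = 0xBC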
+
+; ---------------------------------------------------------------------------- ;
+; Pattern a. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a0_arithmetic:
+; V7M: @ %bb.0:
+; V7M-NEXT: asrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a0_arithmetic:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, asr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a0_arithmetic:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: asrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a0_arithmetic:
+; V6M: @ %bb.0:
+; V6M-NEXT: asrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = ashr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: lsrs r3, r1
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: lsrs r3, r1
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r4, r0
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a0_arithmetic:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: asrpl.w r0, r1, r4
+; V7M-NEXT: asr.w r2, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: asrpl r2, r1, #31
+; V7M-NEXT: and.w r1, r12, r2
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a0_arithmetic:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: asr r2, r1, r2
+; V7A-NEXT: asrpl r0, r1, r3
+; V7A-NEXT: asrpl r2, r1, #31
+; V7A-NEXT: and r0, r4, r0
+; V7A-NEXT: and r1, r12, r2
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a0_arithmetic:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: asrpl.w r0, r1, r4
+; V7A-T-NEXT: asr.w r2, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: asrpl r2, r1, #31
+; V7A-T-NEXT: and.w r1, r12, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a0_arithmetic:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_lasr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = ashr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r4, r3, #32
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: subs.w r12, r3, #32
+; V7M-NEXT: lsl.w r3, lr, r3
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: rsb r12, r3, #32
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: subs r4, r3, #32
+; V7A-NEXT: lsl r3, lr, r3
+; V7A-NEXT: lsr r12, lr, r12
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: lslpl r12, lr, r4
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: subs r3, r3, #1
+; V7A-NEXT: sbc r12, r12, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r3, #32
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: subs.w r12, r3, #32
+; V7A-T-NEXT: lsl.w r3, lr, r3
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: lsr.w r0, r0, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: mov r2, r3
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: rsb.w r1, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r1, lr, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs.w lr, r3, #1
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: sbc r12, r1, #0
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r0, r0, lr
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r6, lr}
+; V7A-NEXT: push {r4, r5, r6, lr}
+; V7A-NEXT: ldr r1, [sp, #16]
+; V7A-NEXT: mov r3, #1
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r5, [r0, #4]
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: lsl r1, r3, r1
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lslpl r0, r3, r4
+; V7A-NEXT: subs r1, r1, #1
+; V7A-NEXT: sbc r3, r0, #0
+; V7A-NEXT: lsr r0, r6, r2
+; V7A-NEXT: rsb r6, r2, #32
+; V7A-NEXT: orr r0, r0, r5, lsl r6
+; V7A-NEXT: subs r6, r2, #32
+; V7A-NEXT: lsrpl r0, r5, r6
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: lsr r1, r5, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r6, pc}
+;
+; V7A-T-LABEL: bextr64_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: movs r3, #1
+; V7A-T-NEXT: ldrd lr, r1, [r0]
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r0, r12, #32
+; V7A-T-NEXT: lsr.w r4, r3, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, r3, r0
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsr.w r3, lr, r2
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r3, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #4
+; V6M-NEXT: sub sp, #4
+; V6M-NEXT: str r2, [sp] @ 4-byte Spill
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #24]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: subs r4, r0, #1
+; V6M-NEXT: sbcs r6, r7
+; V6M-NEXT: ldm r5!, {r0, r1}
+; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: ands r1, r6
+; V6M-NEXT: add sp, #4
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs.w lr, r2, #1
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: sbc r12, r3, #0
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r0, r0, lr
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r6, lr}
+; V7A-NEXT: push {r4, r5, r6, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r3, #1
+; V7A-NEXT: ldr r5, [r0, #4]
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl r0, r3, r4
+; V7A-NEXT: subs r3, r2, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: lsr r2, r5, r1
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r2, r0, r2
+; V7A-NEXT: lsr r0, r6, r1
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: orr r0, r0, r5, lsl r1
+; V7A-NEXT: mov r1, r2
+; V7A-NEXT: lsrpl r0, r5, r4
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r4, r5, r6, pc}
+;
+; V7A-T-LABEL: bextr64_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: lsl.w r2, lr, r2
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: ldrd r12, r0, [r0]
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs.w lr, r2, #1
+; V7A-T-NEXT: sbc r2, r4, #0
+; V7A-T-NEXT: lsr.w r4, r0, r1
+; V7A-T-NEXT: subs.w r3, r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r4, #0
+; V7A-T-NEXT: and.w r2, r2, r4
+; V7A-T-NEXT: rsb.w r4, r1, #32
+; V7A-T-NEXT: lsr.w r1, r12, r1
+; V7A-T-NEXT: lsl.w r4, r0, r4
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r1, r0, r3
+; V7A-T-NEXT: and.w r0, lr, r1
+; V7A-T-NEXT: mov r1, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #4
+; V6M-NEXT: sub sp, #4
+; V6M-NEXT: str r1, [sp] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: subs r4, r0, #1
+; V6M-NEXT: sbcs r5, r7
+; V6M-NEXT: ldm r6!, {r0, r1}
+; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: add sp, #4
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r0, r4
+; V7A-NEXT: and r1, r1, r12
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
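+
+; Note on the V6M (Thumb1) checks in this file: variable 64-bit shifts are
+; not expanded inline but lowered to the AEABI runtime helpers __aeabi_llsr
+; (64-bit logical shift right) and __aeabi_llsl (64-bit logical shift left).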
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsls r2, r1
+; V7M-NEXT: subs r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r1, r2, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mov r1, #1
+; V7A-NEXT: lsl r1, r1, r12
+; V7A-NEXT: subs r2, r12, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: sub r1, r1, #1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a1:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: add r12, r3, lr, lsl r12
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r12, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_a1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %truncshifted
+ ret i32 %masked
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
+define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a2:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: add r12, r3, lr, lsl r12
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r12, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_a2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %zextmask = zext i32 %mask to i64
+ %masked = and i64 %zextmask, %shifted
+ %truncmasked = trunc i64 %masked to i32
+ ret i32 %truncmasked
+}
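+
+; bextr64_32_a1 and bextr64_32_a2 lower to identical code on all four
+; targets: once the result is truncated to 32 bits, it no longer matters
+; whether the masking was done in 64 bits or 32 bits. The same holds for the
+; corresponding b- and c-pattern pairs below.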
+
+; ---------------------------------------------------------------------------- ;
+; Pattern b. 32-bit
+; ---------------------------------------------------------------------------- ;
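+
+; Pattern b forms the low-bits mask by inverting a left shift of all-ones,
+; i.e. mask = ~(-1 << numlowbits). In C terms the extract is roughly
+; (val >> numskipbits) & ~(~0u << numlowbits), assuming 0 < numlowbits < 32.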
+
+define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #0
+; V6M-NEXT: mvns r3, r3
+; V6M-NEXT: lsls r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bics r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #0
+; V6M-NEXT: mvns r3, r3
+; V6M-NEXT: lsls r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bics r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
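+
+; The 64-bit variants also show each target's predication style for zeroing
+; the out-of-range half of a shift: ARM mode (V7A) uses conditionally
+; executed movwpl, while the Thumb2 targets (V7M, V7A-T) wrap the
+; equivalent movpl in IT blocks.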
+
+define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #16]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r5, r0, r3
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: itt pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bic.w r0, r5, r0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsr.w r12, r0, r2
+; V7M-NEXT: rsb.w r0, r2, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r1, r0
+; V7M-NEXT: lsr.w r0, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: subs.w r1, r3, #32
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r2, r1
+; V7M-NEXT: bic.w r1, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bic.w r0, r12, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r12, r0, r2
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: orr r12, r12, r1, lsl r0
+; V7A-NEXT: subs r0, r2, #32
+; V7A-NEXT: lsrpl r12, r1, r0
+; V7A-NEXT: lsr r0, r1, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: subs r1, r3, #32
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: lsl r3, r2, r3
+; V7A-NEXT: lslpl r2, r2, r1
+; V7A-NEXT: bic r1, r0, r2
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: bic r0, r12, r3
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsr.w r12, r0, r2
+; V7A-T-NEXT: rsb.w r0, r2, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r12, r12, r0
+; V7A-T-NEXT: subs.w r0, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r12, r1, r0
+; V7A-T-NEXT: lsr.w r0, r1, r2
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r1, r3, #32
+; V7A-T-NEXT: lsl.w r3, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r2, r1
+; V7A-T-NEXT: bic.w r1, r0, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: bic.w r0, r12, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r4, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r6, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r1, r3, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: lsl.w r2, r3, r12
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r12, r0, [r0]
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w lr, r0, r3
+; V7M-NEXT: lsr.w r3, r12, r1
+; V7M-NEXT: orr.w r12, r3, lr
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r0, r3
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r1
+; V7M-NEXT: bic.w r1, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: bic.w r0, r12, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldm r0, {r0, r3}
+; V7A-NEXT: lsr r12, r0, r1
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: orr r12, r12, r3, lsl r0
+; V7A-NEXT: subs r0, r1, #32
+; V7A-NEXT: lsrpl r12, r3, r0
+; V7A-NEXT: lsr r0, r3, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: subs r1, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r1
+; V7A-NEXT: bic r1, r0, r3
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r12, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: ldrd r12, r3, [r0]
+; V7A-T-NEXT: rsb.w r0, r1, #32
+; V7A-T-NEXT: lsl.w lr, r3, r0
+; V7A-T-NEXT: lsr.w r0, r12, r1
+; V7A-T-NEXT: orr.w r12, r0, lr
+; V7A-T-NEXT: subs.w r0, r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r12, r3, r0
+; V7A-T-NEXT: lsr.w r0, r3, r1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r1
+; V7A-T-NEXT: bic.w r1, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: bic.w r0, r12, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r4, r2
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r6, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #16]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r5, r0, r3
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: itt pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bic.w r0, r5, r0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsls r2, r1
+; V7M-NEXT: subs r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: lsl r1, r1, r12
+; V7A-NEXT: subs r2, r12, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: bic r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsr.w r12, r0, r2
+; V7A-T-NEXT: rsb.w r0, r2, #32
+; V7A-T-NEXT: ldrb.w r3, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: subs.w r2, r3, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r2, [r1]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %widenumlowbits = zext i8 %numlowbits to i64
+ %notmask = shl nsw i64 -1, %widenumlowbits
+ %mask = xor i64 %notmask, -1
+ %wideres = and i64 %shiftedval, %mask
+ %res = trunc i64 %wideres to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r12
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldrb.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r1, [r1]
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: pop {r7, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %truncshiftedval = trunc i64 %shiftedval to i32
+ %widenumlowbits = zext i8 %numlowbits to i32
+ %notmask = shl nsw i32 -1, %widenumlowbits
+ %mask = xor i32 %notmask, -1
+ %res = and i32 %truncshiftedval, %mask
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
+define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b2:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r12
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldrb.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r1, [r1]
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: pop {r7, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %widenumlowbits = zext i8 %numlowbits to i32
+ %notmask = shl nsw i32 -1, %widenumlowbits
+ %mask = xor i32 %notmask, -1
+ %zextmask = zext i32 %mask to i64
+ %wideres = and i64 %shiftedval, %zextmask
+ %res = trunc i64 %wideres to i32
+ ret i32 %res
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern c. 32-bit
+; ---------------------------------------------------------------------------- ;
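+
+; Pattern c forms the mask by right-shifting all-ones by the number of high
+; bits, i.e. mask = -1 >> (32 - numlowbits). In C terms the extract is
+; roughly (val >> numskipbits) & (~0u >> (32 - numlowbits)), assuming
+; 0 < numlowbits <= 32.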
+
+define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
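+
+; On V6M the 64-bit c-pattern takes two libcalls: one __aeabi_llsr to shift
+; the value, and a second one to build the mask as -1 >> (64 - numlowbits).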
+
+define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r5, r1, r2
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r4, r12, #64
+; V7A-NEXT: rsbs lr, r12, #32
+; V7A-NEXT: lsr r4, r3, r4
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: and r12, r4, r5
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: mov r1, r12
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #-1
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r2, lr, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: lsr.w r12, r0, r2
+; V7M-NEXT: rsb.w r0, r2, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r1, r0
+; V7M-NEXT: rsb.w r0, r3, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: uxtb r0, r0
+; V7M-NEXT: subs.w lr, r0, #32
+; V7M-NEXT: lsr.w r2, r3, r0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: and.w r0, r3, r12
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: lsr lr, r0, r12
+; V7A-NEXT: rsb r0, r12, #32
+; V7A-NEXT: orr r4, lr, r1, lsl r0
+; V7A-NEXT: mvn lr, #31
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lsrpl r4, r1, r2
+; V7A-NEXT: rsb r2, r3, #64
+; V7A-NEXT: lsr r1, r1, r12
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsr r0, r3, r12
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: and r1, r0, r1
+; V7A-NEXT: lsrpl r3, r3, r2
+; V7A-NEXT: and r0, r3, r4
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: lsr.w lr, r0, r12
+; V7A-T-NEXT: rsb.w r0, r12, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r4, lr, r0
+; V7A-T-NEXT: mvn lr, #31
+; V7A-T-NEXT: uxtab r2, lr, r2
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r4, r1, r2
+; V7A-T-NEXT: rsb.w r2, r3, #64
+; V7A-T-NEXT: lsr.w r1, r1, r12
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: uxtab r2, lr, r2
+; V7A-T-NEXT: lsr.w r0, r3, r12
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: and.w r1, r1, r0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: and.w r0, r3, r4
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r3
+; V6M-NEXT: uxtb r2, r2
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r5
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r6
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r8, lr}
+; V7A-NEXT: push {r4, r6, r8, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r6, r12, #64
+; V7A-NEXT: ldr r8, [r0]
+; V7A-NEXT: mvn r0, #0
+; V7A-NEXT: rsbs r1, r12, #32
+; V7A-NEXT: lsr r6, r0, r6
+; V7A-NEXT: lsr r4, r3, r2
+; V7A-NEXT: lsrpl r0, r0, r1
+; V7A-NEXT: movwpl r6, #0
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: and r1, r6, r4
+; V7A-NEXT: lsr r6, r8, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: orr r2, r6, r3, lsl r2
+; V7A-NEXT: lsrpl r2, r3, r12
+; V7A-NEXT: and r0, r0, r2
+; V7A-NEXT: pop {r4, r6, r8, pc}
+;
+; V7A-T-LABEL: bextr64_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r1, r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r12, r12, #32
+; V7A-T-NEXT: lsr.w r2, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, r12
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsr.w r12, r0, r1
+; V7M-NEXT: rsb.w r0, r1, #32
+; V7M-NEXT: lsl.w r0, r3, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r3, r0
+; V7M-NEXT: rsb.w r0, r2, #64
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: uxtb r0, r0
+; V7M-NEXT: subs.w lr, r0, #32
+; V7M-NEXT: lsr.w r2, r3, r0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: and.w r0, r3, r12
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: lsr r12, r4, r0
+; V7A-NEXT: rsb r4, r0, #32
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: orr lr, r12, r3, lsl r4
+; V7A-NEXT: mvn r12, #31
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsrpl lr, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: uxtab r4, r12, r1
+; V7A-NEXT: lsr r2, r3, r2
+; V7A-NEXT: cmp r4, #0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r1, r2, r0
+; V7A-NEXT: lsrpl r3, r3, r4
+; V7A-NEXT: and r0, r3, lr
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: rsb.w r3, r0, #32
+; V7A-T-NEXT: lsl.w r4, lr, r3
+; V7A-T-NEXT: lsr.w r3, r12, r0
+; V7A-T-NEXT: orr.w r5, r3, r4
+; V7A-T-NEXT: mvn r12, #31
+; V7A-T-NEXT: uxtab r1, r12, r1
+; V7A-T-NEXT: lsr.w r0, lr, r0
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, lr, r1
+; V7A-T-NEXT: rsb.w r1, r2, #64
+; V7A-T-NEXT: mov.w r4, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: uxtb r2, r1
+; V7A-T-NEXT: uxtab r3, r12, r1
+; V7A-T-NEXT: lsr.w r2, r4, r2
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: and.w r1, r2, r0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r4, r3
+; V7A-T-NEXT: and.w r0, r4, r5
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r2
+; V6M-NEXT: ldr r4, [r0]
+; V6M-NEXT: ldr r3, [r0, #4]
+; V6M-NEXT: uxtb r2, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r5
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r6
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r5, r1, r2
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r4, r12, #64
+; V7A-NEXT: rsbs lr, r12, #32
+; V7A-NEXT: lsr r4, r3, r4
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: and r12, r5, r4
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: mov r1, r12
+; V7A-NEXT: and r0, r0, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #-1
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r2, lr, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: rsbs.w r1, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r2, r1
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r3, [sp]
+; V7A-NEXT: rsbs r12, r3, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsrpl r3, r3, r12
+; V7A-NEXT: lsr r12, r0, r2
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r12, r1, lsl r0
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: rsbs.w r1, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r2, r1
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: ldr r0, [sp, #8]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %truncshifted
+ ret i32 %masked
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
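+; The zero-extended 32-bit mask clears the high word anyway, so this lowers
+; identically to bextr64_32_c1.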
+define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c2:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %zextmask = zext i32 %mask to i64
+ %masked = and i64 %zextmask, %shifted
+ %truncmasked = trunc i64 %masked to i32
+ ret i32 %truncmasked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern d. 32-bit.
+; ---------------------------------------------------------------------------- ;
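+; Pattern d clears the high bits by shifting them out and back in:
+;   ((x >> numskipbits) << (32 - numlowbits)) >> (32 - numlowbits)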
+
+define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %shifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %shifted, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %shifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %shifted, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+; 64-bit.
+
+define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: rsb.w r12, r3, #32
+; V7M-NEXT: lsls r1, r3
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r1, r1, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r2, r1, r12
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, lr
+; V7M-NEXT: lsr.w r1, r1, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsr r1, r2, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: rsb.w r3, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: uxtb.w lr, r2
+; V7M-NEXT: subs.w r2, lr, #32
+; V7M-NEXT: lsr.w r12, r0, lr
+; V7M-NEXT: rsb.w r0, lr, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: rsb.w r2, r3, #64
+; V7M-NEXT: lsr.w r1, r1, lr
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: sub.w r3, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orrs r1, r4
+; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: lsr lr, r0, r12
+; V7A-NEXT: rsb r0, r12, #32
+; V7A-NEXT: orr r0, lr, r1, lsl r0
+; V7A-NEXT: mvn lr, #31
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r2, r3, #64
+; V7A-NEXT: lsr r1, r1, r12
+; V7A-NEXT: uxtb r3, r2
+; V7A-NEXT: rsb r4, r3, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: lsr r5, r0, r4
+; V7A-NEXT: orr r1, r5, r1, lsl r3
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r6, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r6, r7, lr}
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: rsb.w r6, r12, #32
+; V7A-T-NEXT: rsb.w r3, r3, #64
+; V7A-T-NEXT: lsr.w r0, r0, r12
+; V7A-T-NEXT: mvn r7, #31
+; V7A-T-NEXT: uxtab r2, r7, r2
+; V7A-T-NEXT: lsl.w r6, r1, r6
+; V7A-T-NEXT: lsr.w lr, r1, r12
+; V7A-T-NEXT: orrs r0, r6
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w lr, #0
+; V7A-T-NEXT: uxtb r5, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r5, #32
+; V7A-T-NEXT: uxtab r3, r7, r3
+; V7A-T-NEXT: lsl.w r4, lr, r5
+; V7A-T-NEXT: lsr.w r2, r0, r1
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: orr.w r2, r2, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r3
+; V7A-T-NEXT: lsl.w r0, r0, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: lsr.w r0, r0, r5
+; V7A-T-NEXT: orr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r2, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; V6M-LABEL: bextr64_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r3
+; V6M-NEXT: uxtb r2, r2
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r2, r2, r4
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %shifted, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: rsb.w r1, r12, #64
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: rsb.w r12, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsl.w r2, r2, r1
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r2, r2, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsr r1, r2, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r2, r3, r2
+; V7A-T-NEXT: rsb.w r1, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: rsb.w lr, r1, #32
+; V7A-T-NEXT: rsbs.w r3, r12, #32
+; V7A-T-NEXT: lsl.w r2, r2, r1
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r2, r2, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r3
+; V7A-T-NEXT: lsl.w r0, r0, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r2, lr
+; V7A-T-NEXT: lsr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r2, r1
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldrd r0, lr, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: lsr.w r12, r0, r1
+; V7M-NEXT: rsb.w r0, r1, #32
+; V7M-NEXT: lsr.w r1, lr, r1
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: lsl.w r0, lr, r0
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, lr, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: sub.w r3, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orrs r1, r4
+; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: lsr r12, r4, r0
+; V7A-NEXT: rsb r4, r0, #32
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: orr r4, r12, r3, lsl r4
+; V7A-NEXT: mvn r12, #31
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsrpl r4, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: rsb lr, r2, #32
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: lsr r5, r4, lr
+; V7A-NEXT: orr r3, r5, r0, lsl r2
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsl r0, r4, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lslpl r3, r4, r1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r3, lsl lr
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r6, lr}
+; V7A-T-NEXT: push {r4, r5, r6, lr}
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: rsb.w r6, r0, #32
+; V7A-T-NEXT: lsr.w r3, lr, r0
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mvn r4, #31
+; V7A-T-NEXT: lsr.w r0, r12, r0
+; V7A-T-NEXT: uxtab r1, r4, r1
+; V7A-T-NEXT: lsl.w r6, lr, r6
+; V7A-T-NEXT: orrs r0, r6
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: uxtb r5, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, lr, r1
+; V7A-T-NEXT: rsb.w r1, r5, #32
+; V7A-T-NEXT: lsls r3, r5
+; V7A-T-NEXT: uxtab r2, r4, r2
+; V7A-T-NEXT: lsr.w r6, r0, r1
+; V7A-T-NEXT: orrs r3, r6
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r5
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsr.w r0, r0, r5
+; V7A-T-NEXT: orr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r3, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, r5, r6, pc}
+;
+; V6M-LABEL: bextr64_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r2
+; V6M-NEXT: ldr r5, [r0]
+; V6M-NEXT: ldr r3, [r0, #4]
+; V6M-NEXT: uxtb r2, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r2, r2, r4
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %shifted, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: rsb.w r12, r3, #32
+; V7M-NEXT: lsls r1, r3
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r1, r1, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r2, r1, r12
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, lr
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: rsb.w r3, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_d1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_d1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_d1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_d1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %truncshifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant
+; ---------------------------------------------------------------------------- ;
+
+; https://bugs.llvm.org/show_bug.cgi?id=38938
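+; The extracted field (bits 21-30 of the low word) feeds a GEP index; v7
+; targets form it with ubfx and fold the '* 4' scaling into the addressing
+; mode (lsl #2), as checked below.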
+define void @pr38938(ptr %a0, ptr %a1) nounwind {
+; V7M-LABEL: pr38938:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r1, [r1]
+; V7M-NEXT: ubfx r1, r1, #21, #10
+; V7M-NEXT: ldr.w r2, [r0, r1, lsl #2]
+; V7M-NEXT: adds r2, #1
+; V7M-NEXT: str.w r2, [r0, r1, lsl #2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: pr38938:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r1, [r1]
+; V7A-NEXT: ubfx r1, r1, #21, #10
+; V7A-NEXT: ldr r2, [r0, r1, lsl #2]
+; V7A-NEXT: add r2, r2, #1
+; V7A-NEXT: str r2, [r0, r1, lsl #2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: pr38938:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r1, [r1]
+; V7A-T-NEXT: ubfx r1, r1, #21, #10
+; V7A-T-NEXT: ldr.w r2, [r0, r1, lsl #2]
+; V7A-T-NEXT: adds r2, #1
+; V7A-T-NEXT: str.w r2, [r0, r1, lsl #2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: pr38938:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r1]
+; V6M-NEXT: lsrs r1, r1, #19
+; V6M-NEXT: ldr r2, .LCPI51_0
+; V6M-NEXT: ands r2, r1
+; V6M-NEXT: ldr r1, [r0, r2]
+; V6M-NEXT: adds r1, r1, #1
+; V6M-NEXT: str r1, [r0, r2]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI51_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp = load i64, ptr %a1, align 8
+ %tmp1 = lshr i64 %tmp, 21
+ %tmp2 = and i64 %tmp1, 1023
+ %tmp3 = getelementptr inbounds i32, ptr %a0, i64 %tmp2
+ %tmp4 = load i32, ptr %tmp3, align 4
+ %tmp5 = add nsw i32 %tmp4, 1
+ store i32 %tmp5, ptr %tmp3, align 4
+ ret void
+}
+
+; The most canonical variant
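+; (arg >> 19) & 1023 is a contiguous 10-bit field at bit 19, hence ubfx on
+; the v7 targets; v6-M has no ubfx and uses a shift pair instead.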
+define i32 @c0_i32(i32 %arg) nounwind {
+; V7M-LABEL: c0_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #10
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c0_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #10
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c0_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #10
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c0_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ ret i32 %tmp1
+}
+
+; Should still be fine, but the mask is shifted
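+; 4092 = 1023 << 2, so the field no longer starts at bit 0 and ubfx does not
+; apply; a shifted-mask 'and' is emitted instead.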
+define i32 @c1_i32(i32 %arg) nounwind {
+; V7M-LABEL: c1_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r1, #4092
+; V7M-NEXT: and.w r0, r1, r0, lsr #19
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c1_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r1, #4092
+; V7A-NEXT: and r0, r1, r0, lsr #19
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c1_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r1, #4092
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #19
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c1_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r0, #19
+; V6M-NEXT: ldr r0, .LCPI53_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI53_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 4092
+ ret i32 %tmp1
+}
+
+; Should still be fine, but the result is shifted left afterwards
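+; ((arg >> 19) & 1023) << 2 folds to (arg >> 17) & 4092, since shifting the
+; masked value left by 2 equals masking (arg >> 17) with 1023 << 2.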
+define i32 @c2_i32(i32 %arg) nounwind {
+; V7M-LABEL: c2_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r1, #4092
+; V7M-NEXT: and.w r0, r1, r0, lsr #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c2_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r1, #4092
+; V7A-NEXT: and r0, r1, r0, lsr #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c2_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r1, #4092
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c2_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r0, #17
+; V6M-NEXT: ldr r0, .LCPI54_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI54_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ %tmp2 = shl i32 %tmp1, 2
+ ret i32 %tmp2
+}
+
+; The mask covers a newly shifted-in bit
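+; 16382 = 0x3ffe selects bits 1-13, and bit 13 is a zero shifted in by the
+; lshr, so this is not a plain bit-field extract.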
+define i32 @c4_i32_bad(i32 %arg) nounwind {
+; V7M-LABEL: c4_i32_bad:
+; V7M: @ %bb.0:
+; V7M-NEXT: mvn r1, #1
+; V7M-NEXT: and.w r0, r1, r0, lsr #19
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c4_i32_bad:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r1, #1
+; V7A-NEXT: and r0, r1, r0, lsr #19
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c4_i32_bad:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mvn r1, #1
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #19
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c4_i32_bad:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r0, #20
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 16382
+ ret i32 %tmp1
+}
+
+; i64
+
+; The most canonical variant
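+; Bit 51 of the i64 is bit 51 - 32 = 19 of the high word (r1), so the extract
+; reads r1 and the high result word is simply zeroed.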
+define i64 @c0_i64(i64 %arg) nounwind {
+; V7M-LABEL: c0_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r1, #19, #10
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c0_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r1, #19, #10
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c0_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r1, #19, #10
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c0_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r1, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ ret i64 %tmp1
+}
+
+; Should still be fine, but the mask is shifted
+define i64 @c1_i64(i64 %arg) nounwind {
+; V7M-LABEL: c1_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r0, #4092
+; V7M-NEXT: and.w r0, r0, r1, lsr #19
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c1_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: and r0, r0, r1, lsr #19
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c1_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r0, #4092
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #19
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c1_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r1, #19
+; V6M-NEXT: ldr r0, .LCPI57_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI57_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 4092
+ ret i64 %tmp1
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define i64 @c2_i64(i64 %arg) nounwind {
+; V7M-LABEL: c2_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r0, #4092
+; V7M-NEXT: and.w r0, r0, r1, lsr #17
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c2_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: and r0, r0, r1, lsr #17
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c2_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r0, #4092
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #17
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c2_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r1, #17
+; V6M-NEXT: ldr r0, .LCPI58_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI58_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ %tmp2 = shl i64 %tmp1, 2
+ ret i64 %tmp2
+}
+
+; The mask covers a newly shifted-in bit
+define i64 @c4_i64_bad(i64 %arg) nounwind {
+; V7M-LABEL: c4_i64_bad:
+; V7M: @ %bb.0:
+; V7M-NEXT: mvn r0, #1
+; V7M-NEXT: and.w r0, r0, r1, lsr #19
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c4_i64_bad:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r0, #1
+; V7A-NEXT: and r0, r0, r1, lsr #19
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c4_i64_bad:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mvn r0, #1
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #19
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c4_i64_bad:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1, #20
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 16382
+ ret i64 %tmp1
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c5_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #10
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c5_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #10
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c5_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #10
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c5_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: str r0, [r1]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ store i32 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the mask is wider (4095 covers 12 bits, not 10)
+define void @c6_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c6_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #12
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c6_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #12
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c6_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #12
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c6_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: lsrs r0, r0, #20
+; V6M-NEXT: str r0, [r1]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 4095
+ store i32 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c7_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r2, #4092
+; V7M-NEXT: and.w r0, r2, r0, lsr #17
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c7_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r2, #4092
+; V7A-NEXT: and r0, r2, r0, lsr #17
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c7_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r2, #4092
+; V7A-T-NEXT: and.w r0, r2, r0, lsr #17
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c7_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r0, #17
+; V6M-NEXT: ldr r2, .LCPI62_0
+; V6M-NEXT: ands r2, r0
+; V6M-NEXT: str r2, [r1]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI62_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ %tmp2 = shl i32 %tmp1, 2
+ store i32 %tmp2, ptr %ptr
+ ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c5_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: ubfx r1, r1, #19, #10
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c5_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r0, #0
+; V7A-NEXT: str r0, [r2, #4]
+; V7A-NEXT: ubfx r0, r1, #19, #10
+; V7A-NEXT: str r0, [r2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c5_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: ubfx r1, r1, #19, #10
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c5_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsls r1, r1, #3
+; V6M-NEXT: lsrs r1, r1, #22
+; V6M-NEXT: str r1, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ store i64 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the mask is wider (4095 covers 12 bits, not 10)
+define void @c6_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c6_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: ubfx r1, r1, #19, #12
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c6_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r0, #0
+; V7A-NEXT: str r0, [r2, #4]
+; V7A-NEXT: ubfx r0, r1, #19, #12
+; V7A-NEXT: str r0, [r2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c6_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: ubfx r1, r1, #19, #12
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c6_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsls r1, r1, #1
+; V6M-NEXT: lsrs r1, r1, #20
+; V6M-NEXT: str r1, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 4095
+ store i64 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c7_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: movw r3, #4092
+; V7M-NEXT: and.w r1, r3, r1, lsr #17
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c7_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: mov r3, #0
+; V7A-NEXT: and r0, r0, r1, lsr #17
+; V7A-NEXT: stm r2, {r0, r3}
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c7_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: movw r3, #4092
+; V7A-T-NEXT: and.w r1, r3, r1, lsr #17
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c7_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsrs r1, r1, #17
+; V6M-NEXT: ldr r3, .LCPI65_0
+; V6M-NEXT: ands r3, r1
+; V6M-NEXT: str r3, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI65_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ %tmp2 = shl i64 %tmp1, 2
+ store i64 %tmp2, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
new file mode 100644
index 0000000..b483793
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -0,0 +1,2752 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+; Patterns:
+; a) x & ((1 << nbits) - 1)
+; b) x & ~(-1 << nbits)
+; c) x & (-1 >> (32 - nbits))
+; d) (x << (32 - nbits)) >> (32 - nbits)
+; are equivalent.
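+;
+; For example, with x = 0xab and nbits = 4 (unsigned 32-bit, logical shifts):
+;   a) 0xab & ((1 << 4) - 1)   = 0xab & 0xf = 0xb
+;   b) 0xab & ~(-1 << 4)       = 0xab & 0xf = 0xb
+;   c) 0xab & (-1 >> (32 - 4)) = 0xab & 0xf = 0xb
+;   d) (0xab << 28) >> 28      = 0xb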
+
+; ---------------------------------------------------------------------------- ;
+; Pattern a. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+; Check that we don't throw away the vreg_width-1 mask if not using shifts
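+; (63 = 64 - 1 keeps the i64 shift amount in range; the explicit 'and' must
+; survive lowering, as the 'and ... #63' in each output below confirms.)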
+define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a0_masked:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: and r2, r2, #63
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a0_masked:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: and r2, r2, #63
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a0_masked:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: and r2, r2, #63
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a0_masked:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #63
+; V6M-NEXT: ands r2, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %numlowbits.masked = and i64 %numlowbits, 63
+ %onebit = shl i64 1, %numlowbits.masked
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: movs r3, #1
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: sbc r1, r1, #0
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r1, #1
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsr r0, r1, r0
+; V7A-NEXT: lslpl r0, r1, r4
+; V7A-NEXT: lsl r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: subs r2, r1, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: subs.w r0, r2, #32
+; V7A-T-NEXT: lsr.w r3, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r1, r0
+; V7A-T-NEXT: lsl.w r0, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r1, r3, #0
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r5, #0
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r2, r0, #1
+; V6M-NEXT: sbcs r1, r5
+; V6M-NEXT: ldm r4!, {r0, r3}
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r1, #32
+; V7M-NEXT: movs r3, #1
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: subs r3, r1, #1
+; V7M-NEXT: sbc r1, r2, #0
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: lsl r1, r2, r1
+; V7A-NEXT: lsr r0, r2, r0
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lslpl r0, r2, r4
+; V7A-NEXT: subs r2, r1, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r1, #32
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: subs.w r0, r1, #32
+; V7A-T-NEXT: lsr.w r3, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r2, r0
+; V7A-T-NEXT: lsl.w r0, r2, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r1, r3, #0
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r5, #0
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r2, r0, #1
+; V6M-NEXT: sbcs r1, r5
+; V6M-NEXT: ldm r4!, {r0, r3}
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r0, r2
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern b. 32-bit
+; ---------------------------------------------------------------------------- ;
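+;
+; A note on the pattern, as read from the IR bodies below: pattern b builds
+; the mask by inverting a left-shifted all-ones value, i.e.
+; val & ~(-1 << numlowbits), so it should lower to a BIC with a
+; register-shifted operand rather than a separately materialized mask.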
+
+define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
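+;
+; For the i64 variants the shift amount can cross 32, so the non-Thumb1
+; targets predicate the two halves on (numlowbits - 32) being non-negative
+; (the subs/lslpl/movwpl sequences below); V6M has no native 64-bit shift
+; and goes through the __aeabi_llsl libcall instead.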
+
+define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r1, #-1
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsl.w r3, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r1, r12
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: bic.w r1, r2, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: subs r0, r2, #32
+; V7A-NEXT: lsl r2, r1, r2
+; V7A-NEXT: lslpl r1, r1, r0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r3, r1
+; V7A-NEXT: bic r0, r4, r2
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bzhi64_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: ldrd r0, r12, [r0]
+; V7A-T-NEXT: lsl.w r3, r1, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r1, r2
+; V7A-T-NEXT: bics r0, r3
+; V7A-T-NEXT: bic.w r1, r12, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: bics r2, r0
+; V6M-NEXT: bics r3, r1
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: lsl.w r3, r2, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, r12
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: subs r0, r1, #32
+; V7A-NEXT: lsl r4, r2, r1
+; V7A-NEXT: lslpl r2, r2, r0
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: bic r1, r3, r2
+; V7A-NEXT: bic r0, r6, r4
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: ldrd r0, r12, [r0]
+; V7A-T-NEXT: lsl.w r3, r2, r1
+; V7A-T-NEXT: subs r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r2, r1
+; V7A-T-NEXT: bics r0, r3
+; V7A-T-NEXT: bic.w r1, r12, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: bics r2, r0
+; V6M-NEXT: bics r3, r1
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern c. 32-bit
+; ---------------------------------------------------------------------------- ;
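+;
+; Pattern c computes the mask as all-ones shifted right, i.e.
+; val & (-1 >> (32 - numlowbits)). Note that the 32-bit cases below end up
+; in the same lsl/lsr shift-pair form as pattern d; only the 64-bit cases
+; actually materialize the mask.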
+
+define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
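+;
+; Here the mask is -1 lshr (64 - numlowbits); V6M builds it via the
+; __aeabi_llsr libcall, while the other targets split it across two
+; registers with the predicated lsr/lsrpl sequences.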
+
+define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsbs.w lr, r2, #32
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsbs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #64
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r2, r12, r2
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: and r1, r2, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w lr, r2, #32
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsr.w r2, r12, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: mvn r2, #31
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: uxtb r12, lr
+; V7A-NEXT: uxtab r2, r2, lr
+; V7A-NEXT: lsr r12, r3, r12
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: movwpl r12, #0
+; V7A-NEXT: lsrpl r3, r3, r2
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w lr, r2, #64
+; V7A-T-NEXT: mvn r2, #31
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: uxtb.w r12, lr
+; V7A-T-NEXT: uxtab r2, r2, lr
+; V7A-T-NEXT: lsr.w r12, r3, r12
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r2
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsbs.w r1, r2, #32
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r3, r1
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
+; V7A-NEXT: rsbs r1, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: lsrpl r3, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: lsr r1, r12, r1
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r1, r1, r5
+; V7A-NEXT: pop {r5, pc}
+;
+; V7A-T-LABEL: bzhi64_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w r1, r2, #32
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: ldrd r0, lr, [r0]
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r1
+; V7A-T-NEXT: rsb.w r1, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r1, r12, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #64
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: subs.w r2, r1, #32
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r3, r2
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: rsb r1, r1, #64
+; V7A-NEXT: mvn r4, #31
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: uxtab r4, r4, r1
+; V7A-NEXT: lsr r0, r2, r0
+; V7A-NEXT: cmp r4, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: lsrpl r2, r2, r4
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r1, r1, #64
+; V7A-T-NEXT: mvn r3, #31
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: uxtab r3, r3, r1
+; V7A-T-NEXT: lsr.w r0, r2, r0
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: and.w r1, r0, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r2, r3
+; V7A-T-NEXT: and.w r0, r2, r12
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r1
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsbs.w lr, r2, #32
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsbs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #64
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r2, r12, r2
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r0, r0, r3
+; V7A-NEXT: and r1, r1, r2
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w lr, r2, #32
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsr.w r2, r12, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern d. 32-bit
+; ---------------------------------------------------------------------------- ;
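+;
+; Pattern d never forms a mask: the value is shifted left by
+; (32 - numlowbits) and then logically shifted back, dropping the high
+; bits, so the expected lowering is a bare lsl/lsr pair.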
+
+define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %val, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %val, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %val, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %val, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+; 64-bit
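+;
+; The i64 double shift expands to the predicated two-register sequence;
+; on V6M it is simply __aeabi_llsl followed by __aeabi_llsr with the same
+; shift amount.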
+
+define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #64
+; V7M-NEXT: rsbs.w r2, r2, #32
+; V7M-NEXT: rsb.w lr, r3, #32
+; V7M-NEXT: lsl.w r12, r1, r3
+; V7M-NEXT: lsr.w r1, r0, lr
+; V7M-NEXT: orr.w r1, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r2
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r12, r1, lr
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: lsr.w r1, r1, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: rsbs r2, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r1, r3, r1, lsl lr
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, lr
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, lr
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #64
+; V7A-T-NEXT: rsbs.w r2, r2, #32
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r12, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %val, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsl.w r12, r1, r2
+; V7M-NEXT: lsr.w r1, r0, r3
+; V7M-NEXT: orr.w r1, r1, r12
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r12
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r12
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: uxtb r3, lr
+; V7A-NEXT: rsb r12, r3, #32
+; V7A-NEXT: lsr r2, r0, r12
+; V7A-NEXT: orr r1, r2, r1, lsl r3
+; V7A-NEXT: mvn r2, #31
+; V7A-NEXT: uxtab r2, r2, lr
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r2, #64
+; V7A-T-NEXT: mvn r2, #31
+; V7A-T-NEXT: uxtb r3, r4
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: uxtab r2, r2, r4
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bzhi64_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %val, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r1, r2, #64
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w lr, r1, #32
+; V7M-NEXT: rsbs.w r2, r2, #32
+; V7M-NEXT: lsl.w r12, r3, r1
+; V7M-NEXT: lsr.w r3, r0, lr
+; V7M-NEXT: orr.w r3, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r0, r2
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r12, r3, lr
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, r7, r11, lr}
+; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: rsb r3, r2, #64
+; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: rsbs r2, r2, #32
+; V7A-NEXT: lsr r5, r0, r1
+; V7A-NEXT: orr r7, r5, r7, lsl r3
+; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r7, lsl r1
+; V7A-NEXT: lsr r1, r7, r3
+; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r5, r7, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #64
+; V7A-T-NEXT: ldrd r0, r1, [r0]
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: rsbs.w r2, r2, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r12, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r4, r1, r2
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %val, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #64
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w r12, r2, r1
+; V7M-NEXT: lsr.w r2, r0, r3
+; V7M-NEXT: orr.w r2, r2, r12
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r0, r12
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, r7, r11, lr}
+; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: rsb r1, r1, #64
+; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r5, r0, r3
+; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: mvn r5, #31
+; V7A-NEXT: uxtab r1, r5, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lsl r0, r0, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r7, lsl r3
+; V7A-NEXT: lsrpl r0, r7, r1
+; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r5, r7, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r1, #64
+; V7A-T-NEXT: ldrd r0, r2, [r0]
+; V7A-T-NEXT: mvn r1, #31
+; V7A-T-NEXT: uxtb r3, r4
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r2, r3
+; V7A-T-NEXT: uxtab r1, r1, r4
+; V7A-T-NEXT: lsr.w r2, r0, lr
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: orr.w r2, r2, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r1
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r2, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r1
+; V7A-T-NEXT: lsr.w r1, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bzhi64_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r4, r1
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %val, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant mask
+; ---------------------------------------------------------------------------- ;
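+;
+; With a compile-time-constant mask no shifts are needed; these tests
+; check that the and folds to the cheapest immediate form available:
+; bic with the inverted constant, bfc on a contiguous high bit range,
+; a plain and, or a literal-pool load on V6M when the constant is not
+; encodable as an immediate.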
+
+; 32-bit
+
+define i32 @bzhi32_constant_mask32(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask32:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask32:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask32:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %masked = and i32 %val, 2147483647
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask32_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask32_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask32_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask32_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask32_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 2147483647
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask16(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask16:
+; V7M: @ %bb.0:
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask16:
+; V7A: @ %bb.0:
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask16:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask16:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, .LCPI41_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI41_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %masked = and i32 %val, 32767
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask16_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask16_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask16_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask16_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask16_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: ldr r0, .LCPI42_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI42_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 32767
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask8(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask8:
+; V7M: @ %bb.0:
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask8:
+; V7A: @ %bb.0:
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask8:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask8:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %masked = and i32 %val, 127
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask8_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask8_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask8_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask8_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask8_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: movs r0, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 127
+ ret i32 %masked
+}
+
+; 64-bit
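+;
+; For i64 constants only the half the mask actually affects needs an
+; instruction; when the mask fits in the low 32 bits the high half is
+; simply zeroed.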
+
+define i64 @bzhi64_constant_mask64(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask64:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r1, r1, #-1073741824
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask64:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r1, r1, #-1073741824
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r1, r1, #-1073741824
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #3
+; V6M-NEXT: lsls r2, r2, #30
+; V6M-NEXT: bics r1, r2
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 4611686018427387903
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask64_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask64_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: bic r1, r1, #-1073741824
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask64_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: bic r1, r1, #-1073741824
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask64_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldrd r0, r1, [r0]
+; V7A-T-NEXT: bic r1, r1, #-1073741824
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask64_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #3
+; V6M-NEXT: lsls r3, r1, #30
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: bics r1, r3
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 4611686018427387903
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask32(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask32:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask32:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask32:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 2147483647
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask32_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask32_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask32_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask32_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask32_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 2147483647
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask16(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask16:
+; V7M: @ %bb.0:
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask16:
+; V7A: @ %bb.0:
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask16:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask16:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, .LCPI49_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI49_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %masked = and i64 %val, 32767
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask16_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask16_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask16_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask16_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask16_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: ldr r0, .LCPI50_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI50_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 32767
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask8(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask8:
+; V7M: @ %bb.0:
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask8:
+; V7A: @ %bb.0:
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask8:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask8:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 127
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask8_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask8_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask8_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask8_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask8_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: movs r0, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 127
+ ret i64 %masked
+}
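
; Editor's sketch (not part of the patch): every bzhi-style test above reduces
; to an `and` with a (1 << k) - 1 constant; what varies per target is whether
; that folds to bfc, bic with an inverted immediate, or a literal-pool load on
; V6M. The RUN line and triple below are assumptions mirroring the V7M config.
; RUN: llc < %s -mtriple=thumbv7m-none-eabi | FileCheck %s
define i64 @clear_above_bit15(i64 %val) nounwind {
  %masked = and i64 %val, 32767 ; 32767 = 0x7fff = (1 << 15) - 1
  ret i64 %masked               ; on v7-M this folds to bfc r0, #15, #17
}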
diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll
index 960bbf5..df04b67 100644
--- a/llvm/test/CodeGen/X86/isel-fpclass.ll
+++ b/llvm/test/CodeGen/X86/isel-fpclass.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86-SDAGISEL
+; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL
; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL
; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL
+; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL
-; FIXME: We can reuse/delete llvm/test/CodeGen/X86/is_fpclass.ll when all patches are included.
-

-define i1 @isnone_f(float %x) {
-; X86-SDAGISEL-LABEL: isnone_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: xorl %eax, %eax
-; X86-SDAGISEL-NEXT: retl
+define i1 @isnone_f(float %x) nounwind {
+; X86-LABEL: isnone_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
;
; X64-LABEL: isnone_f:
; X64: # %bb.0: # %entry
@@ -28,11 +28,11 @@ entry:
ret i1 %0
}
-define i1 @isany_f(float %x) {
-; X86-SDAGISEL-LABEL: isany_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movb $1, %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @isany_f(float %x) nounwind {
+; X86-LABEL: isany_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb $1, %al
+; X86-NEXT: retl
;
; X64-LABEL: isany_f:
; X64: # %bb.0: # %entry
@@ -50,17 +50,17 @@ entry:
ret i1 %0
}
-define i1 @issignaling_f(float %x) {
-; X86-SDAGISEL-LABEL: issignaling_f:
-; X86-SDAGISEL: # %bb.0:
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %cl
-; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: andb %cl, %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @issignaling_f(float %x) nounwind {
+; X86-LABEL: issignaling_f:
+; X86: # %bb.0:
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %cl
+; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT: setge %al
+; X86-NEXT: andb %cl, %al
+; X86-NEXT: retl
;
; X64-LABEL: issignaling_f:
; X64: # %bb.0:
@@ -76,7 +76,6 @@ define i1 @issignaling_f(float %x) {
; X86-FASTISEL-LABEL: issignaling_f:
; X86-FASTISEL: # %bb.0:
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -87,20 +86,19 @@ define i1 @issignaling_f(float %x) {
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: andb %cl, %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
%a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan"
ret i1 %a0
}
- define i1 @isquiet_f(float %x) {
-; X86-SDAGISEL-LABEL: isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+ define i1 @isquiet_f(float %x) nounwind {
+; X86-LABEL: isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: isquiet_f:
; X64: # %bb.0: # %entry
@@ -113,7 +111,6 @@ define i1 @issignaling_f(float %x) {
; X86-FASTISEL-LABEL: isquiet_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -121,21 +118,20 @@ define i1 @issignaling_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan"
ret i1 %0
}
-define i1 @not_isquiet_f(float %x) {
-; X86-SDAGISEL-LABEL: not_isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @not_isquiet_f(float %x) nounwind {
+; X86-LABEL: not_isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isquiet_f:
; X64: # %bb.0: # %entry
@@ -148,7 +144,6 @@ define i1 @not_isquiet_f(float %x) {
; X86-FASTISEL-LABEL: not_isquiet_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -156,21 +151,20 @@ define i1 @not_isquiet_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan"
ret i1 %0
}
-define i1 @isinf_f(float %x) {
-; X86-SDAGISEL-LABEL: isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @isinf_f(float %x) nounwind {
+; X86-LABEL: isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: isinf_f:
; X64: # %bb.0: # %entry
@@ -183,7 +177,6 @@ define i1 @isinf_f(float %x) {
; X86-FASTISEL-LABEL: isinf_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -191,21 +184,20 @@ define i1 @isinf_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf"
ret i1 %0
}
-define i1 @not_isinf_f(float %x) {
-; X86-SDAGISEL-LABEL: not_isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @not_isinf_f(float %x) nounwind {
+; X86-LABEL: not_isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isinf_f:
; X64: # %bb.0: # %entry
@@ -218,7 +210,6 @@ define i1 @not_isinf_f(float %x) {
; X86-FASTISEL-LABEL: not_isinf_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -226,19 +217,18 @@ define i1 @not_isinf_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf"
ret i1 %0
}
-define i1 @is_plus_inf_f(float %x) {
-; X86-SDAGISEL-LABEL: is_plus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @is_plus_inf_f(float %x) nounwind {
+; X86-LABEL: is_plus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_inf_f:
; X64: # %bb.0: # %entry
@@ -250,25 +240,23 @@ define i1 @is_plus_inf_f(float %x) {
; X86-FASTISEL-LABEL: is_plus_inf_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf"
ret i1 %0
}
-define i1 @is_minus_inf_f(float %x) {
-; X86-SDAGISEL-LABEL: is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @is_minus_inf_f(float %x) nounwind {
+; X86-LABEL: is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -280,25 +268,23 @@ define i1 @is_minus_inf_f(float %x) {
; X86-FASTISEL-LABEL: is_minus_inf_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf"
ret i1 %0
}
-define i1 @not_is_minus_inf_f(float %x) {
-; X86-SDAGISEL-LABEL: not_is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @not_is_minus_inf_f(float %x) nounwind {
+; X86-LABEL: not_is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -310,27 +296,25 @@ define i1 @not_is_minus_inf_f(float %x) {
; X86-FASTISEL-LABEL: not_is_minus_inf_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf"
ret i1 %0
}
-define i1 @isfinite_f(float %x) {
-; X86-SDAGISEL-LABEL: isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @isfinite_f(float %x) nounwind {
+; X86-LABEL: isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: isfinite_f:
; X64: # %bb.0: # %entry
@@ -343,7 +327,6 @@ define i1 @isfinite_f(float %x) {
; X86-FASTISEL-LABEL: isfinite_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -351,21 +334,20 @@ define i1 @isfinite_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite"
ret i1 %0
}
-define i1 @not_isfinite_f(float %x) {
-; X86-SDAGISEL-LABEL: not_isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @not_isfinite_f(float %x) nounwind {
+; X86-LABEL: not_isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isfinite_f:
; X64: # %bb.0: # %entry
@@ -378,7 +360,6 @@ define i1 @not_isfinite_f(float %x) {
; X86-FASTISEL-LABEL: not_isfinite_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
@@ -386,19 +367,18 @@ define i1 @not_isfinite_f(float %x) {
; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite"
ret i1 %0
}
-define i1 @is_plus_finite_f(float %x) {
-; X86-SDAGISEL-LABEL: is_plus_finite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setb %al
-; X86-SDAGISEL-NEXT: retl
+define i1 @is_plus_finite_f(float %x) nounwind {
+; X86-LABEL: is_plus_finite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: setb %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_finite_f:
; X64: # %bb.0: # %entry
@@ -410,13 +390,11 @@ define i1 @is_plus_finite_f(float %x) {
; X86-FASTISEL-LABEL: is_plus_finite_f:
; X86-FASTISEL: # %bb.0: # %entry
; X86-FASTISEL-NEXT: pushl %eax
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
; X86-FASTISEL-NEXT: fstps (%esp)
; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000
; X86-FASTISEL-NEXT: setb %al
; X86-FASTISEL-NEXT: popl %ecx
-; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
; X86-FASTISEL-NEXT: retl
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite"
@@ -424,10 +402,10 @@ entry:
}
define i1 @isnone_d(double %x) nounwind {
-; X86-SDAGISEL-LABEL: isnone_d:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: xorl %eax, %eax
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isnone_d:
+; X86: # %bb.0: # %entry
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
;
; X64-LABEL: isnone_d:
; X64: # %bb.0: # %entry
@@ -446,10 +424,10 @@ entry:
}
define i1 @isany_d(double %x) nounwind {
-; X86-SDAGISEL-LABEL: isany_d:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movb $1, %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isany_d:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb $1, %al
+; X86-NEXT: retl
;
; X64-LABEL: isany_d:
; X64: # %bb.0: # %entry
@@ -468,10 +446,10 @@ entry:
}
define i1 @isnone_f80(x86_fp80 %x) nounwind {
-; X86-SDAGISEL-LABEL: isnone_f80:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: xorl %eax, %eax
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isnone_f80:
+; X86: # %bb.0: # %entry
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
;
; X64-SDAGISEL-LABEL: isnone_f80:
; X64-SDAGISEL: # %bb.0: # %entry
@@ -491,16 +469,21 @@ define i1 @isnone_f80(x86_fp80 %x) nounwind {
; X64-FASTISEL-NEXT: fstp %st(0)
; X64-FASTISEL-NEXT: xorl %eax, %eax
; X64-FASTISEL-NEXT: retq
+;
+; X64-GISEL-LABEL: isnone_f80:
+; X64-GISEL: # %bb.0: # %entry
+; X64-GISEL-NEXT: xorl %eax, %eax
+; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0)
ret i1 %0
}
define i1 @isany_f80(x86_fp80 %x) nounwind {
-; X86-SDAGISEL-LABEL: isany_f80:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movb $1, %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isany_f80:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb $1, %al
+; X86-NEXT: retl
;
; X64-SDAGISEL-LABEL: isany_f80:
; X64-SDAGISEL: # %bb.0: # %entry
@@ -520,6 +503,11 @@ define i1 @isany_f80(x86_fp80 %x) nounwind {
; X64-FASTISEL-NEXT: fstp %st(0)
; X64-FASTISEL-NEXT: movb $1, %al
; X64-FASTISEL-NEXT: retq
+;
+; X64-GISEL-LABEL: isany_f80:
+; X64-GISEL: # %bb.0: # %entry
+; X64-GISEL-NEXT: movb $1, %al
+; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023)
ret i1 %0
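
; Editor's sketch (assumed name and RUN line, not from the patch): throughout
; isel-fpclass.ll the second argument of llvm.is.fpclass is a ten-bit class
; mask (LangRef encoding: bit 0 snan, bit 1 qnan, bit 2 -inf, ..., bit 9 +inf),
; so 516 = 0x204 = 0x200 (+inf) | 0x4 (-inf) selects both infinities, and
; 1023 selects every class while 0 selects none.
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
define i1 @is_any_inf(float %x) nounwind {
entry:
  %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516)
  ret i1 %0
}
declare i1 @llvm.is.fpclass.f32(float, i32)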
diff --git a/llvm/test/CodeGen/X86/isel-smax.ll b/llvm/test/CodeGen/X86/isel-smax.ll
index 9c9a48e..1ce0a80 100644
--- a/llvm/test/CodeGen/X86/isel-smax.ll
+++ b/llvm/test/CodeGen/X86/isel-smax.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone {
-; X64-LABEL: smax_i8:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: cmpb %al, %dil
-; X64-NEXT: cmovgl %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
+; DAG-X64-LABEL: smax_i8:
+; DAG-X64: # %bb.0:
+; DAG-X64-NEXT: movl %esi, %eax
+; DAG-X64-NEXT: cmpb %al, %dil
+; DAG-X64-NEXT: cmovgl %edi, %eax
+; DAG-X64-NEXT: # kill: def $al killed $al killed $eax
+; DAG-X64-NEXT: retq
;
; FASTISEL-X64-LABEL: smax_i8:
; FASTISEL-X64: # %bb.0:
@@ -24,6 +24,17 @@ define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone {
; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax
; FASTISEL-X64-NEXT: retq
;
+; GISEL-X64-LABEL: smax_i8:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %esi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpb %al, %dil
+; GISEL-X64-NEXT: setg %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovnew %di, %ax
+; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X64-NEXT: retq
+;
; X86-LABEL: smax_i8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
@@ -35,16 +46,20 @@ define i8 @smax_i8(i8 %a, i8 %b) nounwind readnone {
; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: smax_i8:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpb %cl, %al
-; FASTISEL-X86-NEXT: jg .LBB0_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB0_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: smax_i8:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpb %al, %cl
+; GISEL-X86-NEXT: setg %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB0_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB0_2:
+; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i8 @llvm.smax.i8(i8 %a, i8 %b)
ret i8 %ret
}
@@ -57,25 +72,28 @@ define i16 @smax_i16(i16 %a, i16 %b) nounwind readnone {
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smax_i16:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpw %ax, %di
-; FASTISEL-X64-NEXT: cmovgl %edi, %eax
-; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smax_i16:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpw %si, %ax
+; GISEL-X64-NEXT: setg %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovew %si, %ax
+; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: smax_i16:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw %cx, %ax
-; X86-NEXT: jg .LBB1_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: .LBB1_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; DAG-X86-LABEL: smax_i16:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: cmpw %cx, %ax
+; DAG-X86-NEXT: jg .LBB1_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: .LBB1_2:
+; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: smax_i16:
; FASTISEL-X86: # %bb.0:
@@ -88,6 +106,21 @@ define i16 @smax_i16(i16 %a, i16 %b) nounwind readnone {
; FASTISEL-X86-NEXT: .LBB1_2:
; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: smax_i16:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpw %ax, %cx
+; GISEL-X86-NEXT: setg %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB1_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB1_2:
+; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i16 @llvm.smax.i16(i16 %a, i16 %b)
ret i16 %ret
}
@@ -99,12 +132,15 @@ define i32 @smax_i32(i32 %a, i32 %b) nounwind readnone {
; X64-NEXT: cmovgl %edi, %eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smax_i32:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpl %esi, %edi
-; FASTISEL-X64-NEXT: cmovgl %edi, %eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smax_i32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setg %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovel %esi, %eax
+; GISEL-X64-NEXT: retq
;
; X86-LABEL: smax_i32:
; X86: # %bb.0:
@@ -117,16 +153,19 @@ define i32 @smax_i32(i32 %a, i32 %b) nounwind readnone {
; X86-NEXT: .LBB2_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: smax_i32:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpl %ecx, %eax
-; FASTISEL-X86-NEXT: jg .LBB2_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB2_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: smax_i32:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpl %eax, %ecx
+; GISEL-X86-NEXT: setg %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB2_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB2_2:
+; GISEL-X86-NEXT: retl
%ret = call i32 @llvm.smax.i32(i32 %a, i32 %b)
ret i32 %ret
}
@@ -138,32 +177,35 @@ define i64 @smax_i64(i64 %a, i64 %b) nounwind readnone {
; X64-NEXT: cmovgq %rdi, %rax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smax_i64:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movq %rsi, %rax
-; FASTISEL-X64-NEXT: cmpq %rsi, %rdi
-; FASTISEL-X64-NEXT: cmovgq %rdi, %rax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smax_i64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movq %rdi, %rax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpq %rsi, %rdi
+; GISEL-X64-NEXT: setg %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmoveq %rsi, %rax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: smax_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: jl .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; DAG-X86-LABEL: smax_i64:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: pushl %edi
+; DAG-X86-NEXT: pushl %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; DAG-X86-NEXT: cmpl %eax, %ecx
+; DAG-X86-NEXT: movl %esi, %edi
+; DAG-X86-NEXT: sbbl %edx, %edi
+; DAG-X86-NEXT: jl .LBB3_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: movl %esi, %edx
+; DAG-X86-NEXT: .LBB3_2:
+; DAG-X86-NEXT: popl %esi
+; DAG-X86-NEXT: popl %edi
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: smax_i64:
; FASTISEL-X86: # %bb.0:
@@ -184,6 +226,44 @@ define i64 @smax_i64(i64 %a, i64 %b) nounwind readnone {
; FASTISEL-X86-NEXT: popl %esi
; FASTISEL-X86-NEXT: popl %edi
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: smax_i64:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: pushl %ebp
+; GISEL-X86-NEXT: pushl %ebx
+; GISEL-X86-NEXT: pushl %edi
+; GISEL-X86-NEXT: pushl %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT: cmpl %eax, %esi
+; GISEL-X86-NEXT: seta %bl
+; GISEL-X86-NEXT: xorl %ecx, %ecx
+; GISEL-X86-NEXT: cmpl %edx, %ebp
+; GISEL-X86-NEXT: setg %bh
+; GISEL-X86-NEXT: sete %cl
+; GISEL-X86-NEXT: testl %ecx, %ecx
+; GISEL-X86-NEXT: je .LBB3_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movb %bl, %bh
+; GISEL-X86-NEXT: .LBB3_2:
+; GISEL-X86-NEXT: movzbl %bh, %edi
+; GISEL-X86-NEXT: andl $1, %edi
+; GISEL-X86-NEXT: je .LBB3_4
+; GISEL-X86-NEXT: # %bb.3:
+; GISEL-X86-NEXT: movl %esi, %eax
+; GISEL-X86-NEXT: .LBB3_4:
+; GISEL-X86-NEXT: testl %edi, %edi
+; GISEL-X86-NEXT: je .LBB3_6
+; GISEL-X86-NEXT: # %bb.5:
+; GISEL-X86-NEXT: movl %ebp, %edx
+; GISEL-X86-NEXT: .LBB3_6:
+; GISEL-X86-NEXT: popl %esi
+; GISEL-X86-NEXT: popl %edi
+; GISEL-X86-NEXT: popl %ebx
+; GISEL-X86-NEXT: popl %ebp
+; GISEL-X86-NEXT: retl
%ret = call i64 @llvm.smax.i64(i64 %a, i64 %b)
ret i64 %ret
}
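
; Editor's note: the RUN-line change above is the crux of the smax/smin/umax/
; umin updates. GlobalISel now selects these intrinsics itself, so the tests
; move from -global-isel-abort=2 (silent fallback to SelectionDAG, sharing the
; X64/X86 prefixes) to -global-isel-abort=1 with dedicated GISEL prefixes.
; The setg / andl $1 / cmov sequences in the new GISEL-X64 checks correspond
; to this hand-expanded compare-and-select IR (function name illustrative):
define i32 @smax_i32_expanded(i32 %a, i32 %b) nounwind {
  %cmp = icmp sgt i32 %a, %b            ; materialized as setg into a 1-bit value
  %ret = select i1 %cmp, i32 %a, i32 %b ; becomes the cmov in the checks
  ret i32 %ret
}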
diff --git a/llvm/test/CodeGen/X86/isel-smin.ll b/llvm/test/CodeGen/X86/isel-smin.ll
index 7349a7c..bbed3c3 100644
--- a/llvm/test/CodeGen/X86/isel-smin.ll
+++ b/llvm/test/CodeGen/X86/isel-smin.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone {
-; X64-LABEL: smin_i8:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: cmpb %al, %dil
-; X64-NEXT: cmovll %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
+; DAG-X64-LABEL: smin_i8:
+; DAG-X64: # %bb.0:
+; DAG-X64-NEXT: movl %esi, %eax
+; DAG-X64-NEXT: cmpb %al, %dil
+; DAG-X64-NEXT: cmovll %edi, %eax
+; DAG-X64-NEXT: # kill: def $al killed $al killed $eax
+; DAG-X64-NEXT: retq
;
; FASTISEL-X64-LABEL: smin_i8:
; FASTISEL-X64: # %bb.0:
@@ -24,6 +24,17 @@ define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone {
; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax
; FASTISEL-X64-NEXT: retq
;
+; GISEL-X64-LABEL: smin_i8:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %esi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpb %al, %dil
+; GISEL-X64-NEXT: setl %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovnew %di, %ax
+; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X64-NEXT: retq
+;
; X86-LABEL: smin_i8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
@@ -35,16 +46,20 @@ define i8 @smin_i8(i8 %a, i8 %b) nounwind readnone {
; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: smin_i8:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpb %cl, %al
-; FASTISEL-X86-NEXT: jl .LBB0_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB0_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: smin_i8:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpb %al, %cl
+; GISEL-X86-NEXT: setl %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB0_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB0_2:
+; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i8 @llvm.smin.i8(i8 %a, i8 %b)
ret i8 %ret
}
@@ -57,25 +72,28 @@ define i16 @smin_i16(i16 %a, i16 %b) nounwind readnone {
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smin_i16:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpw %ax, %di
-; FASTISEL-X64-NEXT: cmovll %edi, %eax
-; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smin_i16:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpw %si, %ax
+; GISEL-X64-NEXT: setl %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovew %si, %ax
+; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: smin_i16:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw %cx, %ax
-; X86-NEXT: jl .LBB1_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: .LBB1_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; DAG-X86-LABEL: smin_i16:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: cmpw %cx, %ax
+; DAG-X86-NEXT: jl .LBB1_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: .LBB1_2:
+; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: smin_i16:
; FASTISEL-X86: # %bb.0:
@@ -88,6 +106,21 @@ define i16 @smin_i16(i16 %a, i16 %b) nounwind readnone {
; FASTISEL-X86-NEXT: .LBB1_2:
; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: smin_i16:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpw %ax, %cx
+; GISEL-X86-NEXT: setl %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB1_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB1_2:
+; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i16 @llvm.smin.i16(i16 %a, i16 %b)
ret i16 %ret
}
@@ -99,12 +132,15 @@ define i32 @smin_i32(i32 %a, i32 %b) nounwind readnone {
; X64-NEXT: cmovll %edi, %eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smin_i32:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpl %esi, %edi
-; FASTISEL-X64-NEXT: cmovll %edi, %eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smin_i32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setl %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovel %esi, %eax
+; GISEL-X64-NEXT: retq
;
; X86-LABEL: smin_i32:
; X86: # %bb.0:
@@ -117,16 +153,19 @@ define i32 @smin_i32(i32 %a, i32 %b) nounwind readnone {
; X86-NEXT: .LBB2_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: smin_i32:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpl %ecx, %eax
-; FASTISEL-X86-NEXT: jl .LBB2_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB2_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: smin_i32:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpl %eax, %ecx
+; GISEL-X86-NEXT: setl %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB2_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB2_2:
+; GISEL-X86-NEXT: retl
%ret = call i32 @llvm.smin.i32(i32 %a, i32 %b)
ret i32 %ret
}
@@ -138,32 +177,35 @@ define i64 @smin_i64(i64 %a, i64 %b) nounwind readnone {
; X64-NEXT: cmovlq %rdi, %rax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: smin_i64:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movq %rsi, %rax
-; FASTISEL-X64-NEXT: cmpq %rsi, %rdi
-; FASTISEL-X64-NEXT: cmovlq %rdi, %rax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: smin_i64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movq %rdi, %rax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpq %rsi, %rdi
+; GISEL-X64-NEXT: setl %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmoveq %rsi, %rax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: smin_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: jl .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; DAG-X86-LABEL: smin_i64:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: pushl %edi
+; DAG-X86-NEXT: pushl %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; DAG-X86-NEXT: cmpl %ecx, %eax
+; DAG-X86-NEXT: movl %edx, %edi
+; DAG-X86-NEXT: sbbl %esi, %edi
+; DAG-X86-NEXT: jl .LBB3_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: movl %esi, %edx
+; DAG-X86-NEXT: .LBB3_2:
+; DAG-X86-NEXT: popl %esi
+; DAG-X86-NEXT: popl %edi
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: smin_i64:
; FASTISEL-X86: # %bb.0:
@@ -184,6 +226,44 @@ define i64 @smin_i64(i64 %a, i64 %b) nounwind readnone {
; FASTISEL-X86-NEXT: popl %esi
; FASTISEL-X86-NEXT: popl %edi
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: smin_i64:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: pushl %ebp
+; GISEL-X86-NEXT: pushl %ebx
+; GISEL-X86-NEXT: pushl %edi
+; GISEL-X86-NEXT: pushl %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT: cmpl %eax, %esi
+; GISEL-X86-NEXT: setb %bl
+; GISEL-X86-NEXT: xorl %ecx, %ecx
+; GISEL-X86-NEXT: cmpl %edx, %ebp
+; GISEL-X86-NEXT: setl %bh
+; GISEL-X86-NEXT: sete %cl
+; GISEL-X86-NEXT: testl %ecx, %ecx
+; GISEL-X86-NEXT: je .LBB3_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movb %bl, %bh
+; GISEL-X86-NEXT: .LBB3_2:
+; GISEL-X86-NEXT: movzbl %bh, %edi
+; GISEL-X86-NEXT: andl $1, %edi
+; GISEL-X86-NEXT: je .LBB3_4
+; GISEL-X86-NEXT: # %bb.3:
+; GISEL-X86-NEXT: movl %esi, %eax
+; GISEL-X86-NEXT: .LBB3_4:
+; GISEL-X86-NEXT: testl %edi, %edi
+; GISEL-X86-NEXT: je .LBB3_6
+; GISEL-X86-NEXT: # %bb.5:
+; GISEL-X86-NEXT: movl %ebp, %edx
+; GISEL-X86-NEXT: .LBB3_6:
+; GISEL-X86-NEXT: popl %esi
+; GISEL-X86-NEXT: popl %edi
+; GISEL-X86-NEXT: popl %ebx
+; GISEL-X86-NEXT: popl %ebp
+; GISEL-X86-NEXT: retl
%ret = call i64 @llvm.smin.i64(i64 %a, i64 %b)
ret i64 %ret
}
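
; Editor's sketch: on i686 GlobalISel has no 64-bit compare, so the GISEL-X86
; i64 blocks above split it into word halves: an unsigned compare of the low
; words, a signed (or unsigned) compare of the high words, and an equality
; test to pick the low-word result when the high words match. The same logic
; as standalone IR (names illustrative, not from the patch):
define i1 @slt_i64_via_i32(i32 %alo, i32 %ahi, i32 %blo, i32 %bhi) nounwind {
  %lo = icmp ult i32 %alo, %blo ; setb %bl in the smin_i64 checks
  %hi = icmp slt i32 %ahi, %bhi ; setl %bh
  %eq = icmp eq i32 %ahi, %bhi  ; sete %cl
  %lt = select i1 %eq, i1 %lo, i1 %hi
  ret i1 %lt
}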
diff --git a/llvm/test/CodeGen/X86/isel-umax.ll b/llvm/test/CodeGen/X86/isel-umax.ll
index a90456c..990af26 100644
--- a/llvm/test/CodeGen/X86/isel-umax.ll
+++ b/llvm/test/CodeGen/X86/isel-umax.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone {
-; X64-LABEL: umax_i8:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: cmpb %al, %dil
-; X64-NEXT: cmoval %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
+; DAG-X64-LABEL: umax_i8:
+; DAG-X64: # %bb.0:
+; DAG-X64-NEXT: movl %esi, %eax
+; DAG-X64-NEXT: cmpb %al, %dil
+; DAG-X64-NEXT: cmoval %edi, %eax
+; DAG-X64-NEXT: # kill: def $al killed $al killed $eax
+; DAG-X64-NEXT: retq
;
; FASTISEL-X64-LABEL: umax_i8:
; FASTISEL-X64: # %bb.0:
@@ -24,6 +24,17 @@ define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone {
; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax
; FASTISEL-X64-NEXT: retq
;
+; GISEL-X64-LABEL: umax_i8:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %esi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpb %al, %dil
+; GISEL-X64-NEXT: seta %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovnew %di, %ax
+; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X64-NEXT: retq
+;
; X86-LABEL: umax_i8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
@@ -35,16 +46,20 @@ define i8 @umax_i8(i8 %a, i8 %b) nounwind readnone {
; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: umax_i8:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpb %cl, %al
-; FASTISEL-X86-NEXT: ja .LBB0_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB0_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: umax_i8:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpb %al, %cl
+; GISEL-X86-NEXT: seta %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB0_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB0_2:
+; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i8 @llvm.umax.i8(i8 %a, i8 %b)
ret i8 %ret
}
@@ -57,25 +72,28 @@ define i16 @umax_i16(i16 %a, i16 %b) nounwind readnone {
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umax_i16:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpw %ax, %di
-; FASTISEL-X64-NEXT: cmoval %edi, %eax
-; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umax_i16:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpw %si, %ax
+; GISEL-X64-NEXT: seta %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovew %si, %ax
+; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: umax_i16:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw %cx, %ax
-; X86-NEXT: ja .LBB1_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: .LBB1_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; DAG-X86-LABEL: umax_i16:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: cmpw %cx, %ax
+; DAG-X86-NEXT: ja .LBB1_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: .LBB1_2:
+; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: umax_i16:
; FASTISEL-X86: # %bb.0:
@@ -88,6 +106,21 @@ define i16 @umax_i16(i16 %a, i16 %b) nounwind readnone {
; FASTISEL-X86-NEXT: .LBB1_2:
; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: umax_i16:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpw %ax, %cx
+; GISEL-X86-NEXT: seta %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB1_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB1_2:
+; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i16 @llvm.umax.i16(i16 %a, i16 %b)
ret i16 %ret
}
@@ -99,12 +132,15 @@ define i32 @umax_i32(i32 %a, i32 %b) nounwind readnone {
; X64-NEXT: cmoval %edi, %eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umax_i32:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpl %esi, %edi
-; FASTISEL-X64-NEXT: cmoval %edi, %eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umax_i32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: seta %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovel %esi, %eax
+; GISEL-X64-NEXT: retq
;
; X86-LABEL: umax_i32:
; X86: # %bb.0:
@@ -117,16 +153,19 @@ define i32 @umax_i32(i32 %a, i32 %b) nounwind readnone {
; X86-NEXT: .LBB2_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: umax_i32:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpl %ecx, %eax
-; FASTISEL-X86-NEXT: ja .LBB2_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB2_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: umax_i32:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpl %eax, %ecx
+; GISEL-X86-NEXT: seta %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB2_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB2_2:
+; GISEL-X86-NEXT: retl
%ret = call i32 @llvm.umax.i32(i32 %a, i32 %b)
ret i32 %ret
}
@@ -138,32 +177,35 @@ define i64 @umax_i64(i64 %a, i64 %b) nounwind readnone {
; X64-NEXT: cmovaq %rdi, %rax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umax_i64:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movq %rsi, %rax
-; FASTISEL-X64-NEXT: cmpq %rsi, %rdi
-; FASTISEL-X64-NEXT: cmovaq %rdi, %rax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umax_i64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movq %rdi, %rax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpq %rsi, %rdi
+; GISEL-X64-NEXT: seta %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmoveq %rsi, %rax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: umax_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: jb .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; DAG-X86-LABEL: umax_i64:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: pushl %edi
+; DAG-X86-NEXT: pushl %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; DAG-X86-NEXT: cmpl %eax, %ecx
+; DAG-X86-NEXT: movl %esi, %edi
+; DAG-X86-NEXT: sbbl %edx, %edi
+; DAG-X86-NEXT: jb .LBB3_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: movl %esi, %edx
+; DAG-X86-NEXT: .LBB3_2:
+; DAG-X86-NEXT: popl %esi
+; DAG-X86-NEXT: popl %edi
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: umax_i64:
; FASTISEL-X86: # %bb.0:
@@ -184,6 +226,44 @@ define i64 @umax_i64(i64 %a, i64 %b) nounwind readnone {
; FASTISEL-X86-NEXT: popl %esi
; FASTISEL-X86-NEXT: popl %edi
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: umax_i64:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: pushl %ebp
+; GISEL-X86-NEXT: pushl %ebx
+; GISEL-X86-NEXT: pushl %edi
+; GISEL-X86-NEXT: pushl %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT: cmpl %eax, %esi
+; GISEL-X86-NEXT: seta %bl
+; GISEL-X86-NEXT: xorl %ecx, %ecx
+; GISEL-X86-NEXT: cmpl %edx, %ebp
+; GISEL-X86-NEXT: seta %bh
+; GISEL-X86-NEXT: sete %cl
+; GISEL-X86-NEXT: testl %ecx, %ecx
+; GISEL-X86-NEXT: je .LBB3_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movb %bl, %bh
+; GISEL-X86-NEXT: .LBB3_2:
+; GISEL-X86-NEXT: movzbl %bh, %edi
+; GISEL-X86-NEXT: andl $1, %edi
+; GISEL-X86-NEXT: je .LBB3_4
+; GISEL-X86-NEXT: # %bb.3:
+; GISEL-X86-NEXT: movl %esi, %eax
+; GISEL-X86-NEXT: .LBB3_4:
+; GISEL-X86-NEXT: testl %edi, %edi
+; GISEL-X86-NEXT: je .LBB3_6
+; GISEL-X86-NEXT: # %bb.5:
+; GISEL-X86-NEXT: movl %ebp, %edx
+; GISEL-X86-NEXT: .LBB3_6:
+; GISEL-X86-NEXT: popl %esi
+; GISEL-X86-NEXT: popl %edi
+; GISEL-X86-NEXT: popl %ebx
+; GISEL-X86-NEXT: popl %ebp
+; GISEL-X86-NEXT: retl
%ret = call i64 @llvm.umax.i64(i64 %a, i64 %b)
ret i64 %ret
}
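
; Editor's sketch: the unsigned variants differ from smax/smin only in the
; predicate, icmp ugt/ult instead of sgt/slt, hence seta/setb in place of
; setg/setl in the GISEL checks. Equivalent IR for the umax case (function
; name illustrative):
define i64 @umax_i64_expanded(i64 %a, i64 %b) nounwind {
  %cmp = icmp ugt i64 %a, %b            ; seta %cl on x86-64
  %ret = select i1 %cmp, i64 %a, i64 %b
  ret i64 %ret
}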
diff --git a/llvm/test/CodeGen/X86/isel-umin.ll b/llvm/test/CodeGen/X86/isel-umin.ll
index 53a0b27..1710b9f 100644
--- a/llvm/test/CodeGen/X86/isel-umin.ll
+++ b/llvm/test/CodeGen/X86/isel-umin.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FASTISEL-X86
-; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86,DAG-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone {
-; X64-LABEL: umin_i8:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: cmpb %al, %dil
-; X64-NEXT: cmovbl %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
+; DAG-X64-LABEL: umin_i8:
+; DAG-X64: # %bb.0:
+; DAG-X64-NEXT: movl %esi, %eax
+; DAG-X64-NEXT: cmpb %al, %dil
+; DAG-X64-NEXT: cmovbl %edi, %eax
+; DAG-X64-NEXT: # kill: def $al killed $al killed $eax
+; DAG-X64-NEXT: retq
;
; FASTISEL-X64-LABEL: umin_i8:
; FASTISEL-X64: # %bb.0:
@@ -24,6 +24,17 @@ define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone {
; FASTISEL-X64-NEXT: # kill: def $al killed $al killed $eax
; FASTISEL-X64-NEXT: retq
;
+; GISEL-X64-LABEL: umin_i8:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %esi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpb %al, %dil
+; GISEL-X64-NEXT: setb %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovnew %di, %ax
+; GISEL-X64-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X64-NEXT: retq
+;
; X86-LABEL: umin_i8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
@@ -35,16 +46,20 @@ define i8 @umin_i8(i8 %a, i8 %b) nounwind readnone {
; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: umin_i8:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpb %cl, %al
-; FASTISEL-X86-NEXT: jb .LBB0_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB0_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: umin_i8:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpb %al, %cl
+; GISEL-X86-NEXT: setb %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB0_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB0_2:
+; GISEL-X86-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i8 @llvm.umin.i8(i8 %a, i8 %b)
ret i8 %ret
}
@@ -57,25 +72,28 @@ define i16 @umin_i16(i16 %a, i16 %b) nounwind readnone {
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umin_i16:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpw %ax, %di
-; FASTISEL-X64-NEXT: cmovbl %edi, %eax
-; FASTISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umin_i16:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpw %si, %ax
+; GISEL-X64-NEXT: setb %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovew %si, %ax
+; GISEL-X64-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: umin_i16:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw %cx, %ax
-; X86-NEXT: jb .LBB1_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: .LBB1_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; DAG-X86-LABEL: umin_i16:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: cmpw %cx, %ax
+; DAG-X86-NEXT: jb .LBB1_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: .LBB1_2:
+; DAG-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: umin_i16:
; FASTISEL-X86: # %bb.0:
@@ -88,6 +106,21 @@ define i16 @umin_i16(i16 %a, i16 %b) nounwind readnone {
; FASTISEL-X86-NEXT: .LBB1_2:
; FASTISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: umin_i16:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpw %ax, %cx
+; GISEL-X86-NEXT: setb %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB1_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB1_2:
+; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-X86-NEXT: retl
%ret = call i16 @llvm.umin.i16(i16 %a, i16 %b)
ret i16 %ret
}
@@ -99,12 +132,15 @@ define i32 @umin_i32(i32 %a, i32 %b) nounwind readnone {
; X64-NEXT: cmovbl %edi, %eax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umin_i32:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movl %esi, %eax
-; FASTISEL-X64-NEXT: cmpl %esi, %edi
-; FASTISEL-X64-NEXT: cmovbl %edi, %eax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umin_i32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movl %edi, %eax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setb %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmovel %esi, %eax
+; GISEL-X64-NEXT: retq
;
; X86-LABEL: umin_i32:
; X86: # %bb.0:
@@ -117,16 +153,19 @@ define i32 @umin_i32(i32 %a, i32 %b) nounwind readnone {
; X86-NEXT: .LBB2_2:
; X86-NEXT: retl
;
-; FASTISEL-X86-LABEL: umin_i32:
-; FASTISEL-X86: # %bb.0:
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FASTISEL-X86-NEXT: cmpl %ecx, %eax
-; FASTISEL-X86-NEXT: jb .LBB2_2
-; FASTISEL-X86-NEXT: # %bb.1:
-; FASTISEL-X86-NEXT: movl %ecx, %eax
-; FASTISEL-X86-NEXT: .LBB2_2:
-; FASTISEL-X86-NEXT: retl
+; GISEL-X86-LABEL: umin_i32:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: xorl %edx, %edx
+; GISEL-X86-NEXT: cmpl %eax, %ecx
+; GISEL-X86-NEXT: setb %dl
+; GISEL-X86-NEXT: andl $1, %edx
+; GISEL-X86-NEXT: je .LBB2_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movl %ecx, %eax
+; GISEL-X86-NEXT: .LBB2_2:
+; GISEL-X86-NEXT: retl
%ret = call i32 @llvm.umin.i32(i32 %a, i32 %b)
ret i32 %ret
}
@@ -138,32 +177,35 @@ define i64 @umin_i64(i64 %a, i64 %b) nounwind readnone {
; X64-NEXT: cmovbq %rdi, %rax
; X64-NEXT: retq
;
-; FASTISEL-X64-LABEL: umin_i64:
-; FASTISEL-X64: # %bb.0:
-; FASTISEL-X64-NEXT: movq %rsi, %rax
-; FASTISEL-X64-NEXT: cmpq %rsi, %rdi
-; FASTISEL-X64-NEXT: cmovbq %rdi, %rax
-; FASTISEL-X64-NEXT: retq
+; GISEL-X64-LABEL: umin_i64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: movq %rdi, %rax
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpq %rsi, %rdi
+; GISEL-X64-NEXT: setb %cl
+; GISEL-X64-NEXT: andl $1, %ecx
+; GISEL-X64-NEXT: cmoveq %rsi, %rax
+; GISEL-X64-NEXT: retq
;
-; X86-LABEL: umin_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: jb .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; DAG-X86-LABEL: umin_i64:
+; DAG-X86: # %bb.0:
+; DAG-X86-NEXT: pushl %edi
+; DAG-X86-NEXT: pushl %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; DAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; DAG-X86-NEXT: cmpl %ecx, %eax
+; DAG-X86-NEXT: movl %edx, %edi
+; DAG-X86-NEXT: sbbl %esi, %edi
+; DAG-X86-NEXT: jb .LBB3_2
+; DAG-X86-NEXT: # %bb.1:
+; DAG-X86-NEXT: movl %ecx, %eax
+; DAG-X86-NEXT: movl %esi, %edx
+; DAG-X86-NEXT: .LBB3_2:
+; DAG-X86-NEXT: popl %esi
+; DAG-X86-NEXT: popl %edi
+; DAG-X86-NEXT: retl
;
; FASTISEL-X86-LABEL: umin_i64:
; FASTISEL-X86: # %bb.0:
@@ -184,6 +226,44 @@ define i64 @umin_i64(i64 %a, i64 %b) nounwind readnone {
; FASTISEL-X86-NEXT: popl %esi
; FASTISEL-X86-NEXT: popl %edi
; FASTISEL-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: umin_i64:
+; GISEL-X86: # %bb.0:
+; GISEL-X86-NEXT: pushl %ebp
+; GISEL-X86-NEXT: pushl %ebx
+; GISEL-X86-NEXT: pushl %edi
+; GISEL-X86-NEXT: pushl %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT: cmpl %eax, %esi
+; GISEL-X86-NEXT: setb %bl
+; GISEL-X86-NEXT: xorl %ecx, %ecx
+; GISEL-X86-NEXT: cmpl %edx, %ebp
+; GISEL-X86-NEXT: setb %bh
+; GISEL-X86-NEXT: sete %cl
+; GISEL-X86-NEXT: testl %ecx, %ecx
+; GISEL-X86-NEXT: je .LBB3_2
+; GISEL-X86-NEXT: # %bb.1:
+; GISEL-X86-NEXT: movb %bl, %bh
+; GISEL-X86-NEXT: .LBB3_2:
+; GISEL-X86-NEXT: movzbl %bh, %edi
+; GISEL-X86-NEXT: andl $1, %edi
+; GISEL-X86-NEXT: je .LBB3_4
+; GISEL-X86-NEXT: # %bb.3:
+; GISEL-X86-NEXT: movl %esi, %eax
+; GISEL-X86-NEXT: .LBB3_4:
+; GISEL-X86-NEXT: testl %edi, %edi
+; GISEL-X86-NEXT: je .LBB3_6
+; GISEL-X86-NEXT: # %bb.5:
+; GISEL-X86-NEXT: movl %ebp, %edx
+; GISEL-X86-NEXT: .LBB3_6:
+; GISEL-X86-NEXT: popl %esi
+; GISEL-X86-NEXT: popl %edi
+; GISEL-X86-NEXT: popl %ebx
+; GISEL-X86-NEXT: popl %ebp
+; GISEL-X86-NEXT: retl
%ret = call i64 @llvm.umin.i64(i64 %a, i64 %b)
ret i64 %ret
}
diff --git a/llvm/test/CodeGen/X86/pr161693.ll b/llvm/test/CodeGen/X86/pr161693.ll
new file mode 100644
index 0000000..de8188f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr161693.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+define void @PR161693() #0 {
+; CHECK-LABEL: PR161693:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movzbl (%rax), %eax
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: addb $-71, %al
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmpb $-6, %al
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: leal (%rcx,%rcx), %edx
+; CHECK-NEXT: orb %cl, %dl
+; CHECK-NEXT: leal (,%rdx,4), %ecx
+; CHECK-NEXT: orb %dl, %cl
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.2: # %exit
+; CHECK-NEXT: retq
+start:
+ br label %loop
+
+loop:
+ %.val.i.i89 = load <16 x i8>, ptr poison, align 1
+ %.not49.i = icmp ult <16 x i8> zeroinitializer, splat (i8 -10)
+ %i = and <16 x i8> %.val.i.i89, splat (i8 -33)
+ %i1 = add <16 x i8> %i, splat (i8 -71)
+ %.not51.i = icmp ult <16 x i8> %i1, splat (i8 -6)
+ %.not46.i = and <16 x i1> %.not49.i, %.not51.i
+ %i2 = bitcast <16 x i1> %.not46.i to i16
+ %_0.i = icmp eq i16 %i2, 0
+ br i1 %_0.i, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+soft-float" }
diff --git a/llvm/test/DebugInfo/symbolize-build-id.test b/llvm/test/DebugInfo/symbolize-build-id.test
index d63f43f..2620718 100644
--- a/llvm/test/DebugInfo/symbolize-build-id.test
+++ b/llvm/test/DebugInfo/symbolize-build-id.test
@@ -21,6 +21,7 @@ Sections:
Type: SHT_NOTE
Flags: [ SHF_ALLOC ]
Content: 040000000800000003000000474e5500abb50d82b6bdc861
+ AddressAlign: 4
ProgramHeaders:
- Type: PT_NOTE
Flags: [ PF_R ]
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
index 9296f04..ed76a28 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
@@ -22,7 +22,7 @@
# CHECK-OBJ: Contents of section .rodata:
# CHECK-OBJ: 0000 48310048 32004833 00 H1.H2.H3.
-# CHECK-LG: Starting link phase 1 for graph
+# CHECK-LG: Starting link phase 1
# CHECK-LG: section .rodata:
# CHECK-LG: block 0x0 size = 0x00000009, align = 1, alignment-offset = 0
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s
new file mode 100644
index 0000000..557e403
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s
@@ -0,0 +1,7 @@
+ .section __DATA,__data
+ .globl x
+ .p2align 2, 0x0
+x:
+ .long 0
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s
new file mode 100644
index 0000000..711c8a0
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s
@@ -0,0 +1,7 @@
+ .section __DATA,__data
+ .globl x
+ .p2align 2, 0x0
+x:
+ .long 1
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s
new file mode 100644
index 0000000..c58f84e
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s
@@ -0,0 +1,32 @@
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/main.o %s
+# RUN: llvm-mc -triple=arm64-apple-darwin -filetype=obj -o %t/x.arm64.o \
+# RUN: %S/Inputs/x-1.s
+# RUN: llvm-ar crs %t/libX.arm64.a %t/x.arm64.o
+# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/x.arm64e.o \
+# RUN: %S/Inputs/x-0.s
+# RUN: llvm-ar crs %t/libX.arm64e.a %t/x.arm64e.o
+# RUN: llvm-lipo --create --output %t/libX.a %t/libX.arm64.a %t/libX.arm64e.a
+# RUN: llvm-jitlink -noexec -check=%s %t/main.o -L%t -lX
+#
+# Create a universal archive with two slices (arm64e, arm64), each containing
+# a definition of x: in the arm64e slice x = 0, in the arm64 slice x = 1.
+# Check that when we load an arm64e object file we link the arm64e slice
+# of the archive, verified by checking that x = 0.
+#
+
+# jitlink-check: *{4}x = 0
+
+ .section __TEXT,__text,regular,pure_instructions
+ .globl _main
+ .p2align 2
+_main:
+ mov w0, #0
+ ret
+
+ .section __DATA,__data
+ .globl p
+p:
+ .quad x
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
index 2b5c9e3..5f6babf 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
@@ -102,7 +102,7 @@ p:
call o
.size p, .-p
-# CHECK: Link graph "{{.*}}" before copy-and-fixup:
+# CHECK: Link graph before copy-and-fixup:
# CHECK: section .text:
# CHECK: block 0x1000
# CHECK: symbols:
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
index 3bbfd55..c31250b 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
@@ -131,7 +131,7 @@ p:
call o
.size p, .-p
-# CHECK: Link graph "{{.*}}" before copy-and-fixup:
+# CHECK: Link graph before copy-and-fixup:
# CHECK: section .text:
# CHECK: block 0x1000
# CHECK: symbols:
diff --git a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
new file mode 100644
index 0000000..512ea37
--- /dev/null
+++ b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=true < %s | FileCheck %s
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=false < %s | FileCheck %s --check-prefix=MEMDEPFALSE
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+; MEMDEPFALSE-LABEL: @forward_binop_with_sel(
+; MEMDEPFALSE-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; MEMDEPFALSE-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; MEMDEPFALSE-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; MEMDEPFALSE-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; MEMDEPFALSE-NEXT: [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]])
+; MEMDEPFALSE-NEXT: ret <4 x float> [[LOAD_1_0]]
+;
+ %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+ %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fmul = fmul <4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+ %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+ ret <4 x float> %load.1.0
+}
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 984a756..b112e99 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -36,6 +36,180 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
ret <128 x i8> %v4
}
-declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
-declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask)
+ %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load2
+}
+
+define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, <4 x i1> %mask) {
+; CHECK-LABEL: @forward_masked_load_arbitrary_mask(
+; CHECK-NEXT: [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[LOC_A:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr [[LOC_B:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[LOAD1]], <4 x float> zeroinitializer
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_a, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %loc_b, i32 1, <4 x i1> %mask)
+ %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_b, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load2
+}
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT: ret <4 x float> [[FMUL]]
+;
+ %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fmul = fmul <4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+ %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+ %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fmul = fmul <4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+ %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+ ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]]
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load2
+}
+define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable_type_mismatch(
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD2]]
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load1 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> zeroinitializer)
+ call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @generate_sel_with_passthrough(
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]]
+;
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT: [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP3]]
+;
+ %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+ %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+ %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+ %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %gep.0.16 = getelementptr i8, ptr %0, i32 16
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+ %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+ %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+ ret <vscale x 4 x float> %load.1.0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index c3b0bc8..27ca414 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -86,7 +86,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 41
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 229209e..5ae0839 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -204,37 +204,33 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -670,39 +666,35 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -996,36 +988,32 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[TMP15]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -1140,32 +1128,28 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
-; CHECK-SVE-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -1277,36 +1261,32 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP16]], [[TMP17]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index dd239c0..8ece59a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -81,7 +81,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 49e9989..09b41fb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -12,40 +12,40 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -82,14 +82,14 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
@@ -123,40 +123,40 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -193,14 +193,14 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 6e11e55..3a88273 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -12,74 +12,62 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP6]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP14]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP11]]
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -139,78 +127,52 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP9]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[TMP14]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[TMP4]]
;
; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[TMP22]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[TMP23]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
@@ -274,86 +236,66 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[TMP16]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP17]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[TMP4]]
;
; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod(
; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i16>, ptr [[TMP30]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i16>, ptr [[TMP21]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD6]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP22]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 2 x i64> [[TMP24]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP27]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod(
; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
@@ -497,7 +439,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
@@ -656,7 +598,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -803,7 +745,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
@@ -851,7 +793,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8
@@ -952,7 +894,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -990,7 +932,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8
@@ -1058,22 +1000,18 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1085,38 +1023,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP41]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP35]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP30]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1124,26 +1062,22 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 3
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1155,90 +1089,74 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP56]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = shl nuw i64 [[TMP25]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = shl nuw i64 [[TMP35]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
-; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = shl nuw i64 [[TMP57]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
-; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
-; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = shl nuw i64 [[TMP67]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = shl nuw i64 [[TMP73]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
-; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
-; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1396,7 +1314,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]]
@@ -1434,7 +1352,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]]
@@ -1525,7 +1443,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1572,7 +1490,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
@@ -1607,7 +1525,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1666,7 +1584,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1713,7 +1631,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
@@ -1748,7 +1666,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1866,7 +1784,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -1978,7 +1896,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2016,7 +1934,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -2053,7 +1971,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2111,7 +2029,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2149,7 +2067,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -2186,7 +2104,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2226,36 +2144,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVE1: for.body.preheader:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2267,50 +2181,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVED: for.body.preheader:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2349,7 +2245,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -2471,7 +2367,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]])
; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]])
@@ -2571,7 +2467,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
@@ -2671,7 +2567,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
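
The updated expectations above all encode the same rewrite: instead of accumulating into a scalable <vscale x 4 x i32> phi with a plain add and reducing the full-width vector in the middle block, the vectorizer now multiplies in a wide <16 x i32> vector and folds it into a narrow <4 x i32> accumulator with llvm.vector.partial.reduce.add, so the middle block reduces only four lanes. A minimal sketch of that shape, using the intrinsic signatures that appear verbatim in the CHECK lines (the function and value names here are illustrative, not taken from any one test):

  ; Illustrative only: %mul stands in for the sign-extended product the tests
  ; match as TMP values, %acc for the VEC_PHI accumulator.
  define i32 @partial_reduce_shape(<16 x i32> %mul, <4 x i32> %acc) {
    ; Fold the 16 wide lanes into the 4 accumulator lanes; the exact lane
    ; grouping is left to the target, but the total sum is preserved.
    %acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
    ; After the loop, only four lanes remain to be reduced.
    %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc.next)
    ret i32 %sum
  }
  declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

This fixed-width shape is the one that maps naturally onto AArch64 dot-product instructions, which is presumably why it is now preferred over the scalable form in these dotp tests.
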
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 11ff688..7bb4715 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -12,77 +12,65 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP22]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add <vscale x 4 x i32> [[VEC_PHI1]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP26]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]]
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
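
For the subtracting variant in partial-reduce-sub.ll above, the expectations show that no separate partial-reduce subtraction intrinsic is involved: the wide product is negated first (the sub <16 x i32> zeroinitializer, ... lines) and the negated vector is accumulated through the same llvm.vector.partial.reduce.add.v4i32.v16i32 call, since acc - b*c is just acc + (0 - b*c).
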
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index db3166c..3c2ae1c7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -17,16 +17,16 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = add <16 x i32> [[TMP2]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -38,22 +38,22 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -199,16 +199,16 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -220,22 +220,22 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -293,16 +293,16 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -314,22 +314,22 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i32 8
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <8 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -764,16 +764,16 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -785,22 +785,22 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -984,21 +984,21 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1015,26 +1015,26 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
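These CHECK updates all follow one pattern: the vectorizer now feeds zext/sext add reductions through @llvm.vector.partial.reduce.add, so the vector PHI shrinks from the wide extended type (e.g. <16 x i64>) to the final reduction type (<2 x i64> or <4 x i32>), and the middle block reduces the narrow accumulator. Below is a minimal C++ model of the intrinsic's semantics (a sketch, not LLVM API; the intrinsic only guarantees the total sum, so the modulo bucketing here is just one valid lane distribution):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Fold a wide M-lane vector into an N-lane accumulator (M a multiple of N),
// mirroring @llvm.vector.partial.reduce.add.v2i64.v16i64 above.
static std::vector<int64_t> partialReduceAdd(std::vector<int64_t> Acc,
                                             const std::vector<int64_t> &Wide) {
  assert(!Acc.empty() && Wide.size() % Acc.size() == 0);
  for (size_t I = 0; I < Wide.size(); ++I)
    Acc[I % Acc.size()] += Wide[I]; // one valid bucketing; only the sum is fixed
  return Acc;
}

int main() {
  std::vector<int64_t> Acc(2, 0);   // the <2 x i64> vector PHI
  std::vector<int64_t> Wide(16, 3); // a zext'd <16 x i8> load -> <16 x i64>
  Acc = partialReduceAdd(std::move(Acc), Wide);
  // middle.block: @llvm.vector.reduce.add.v2i64 on the narrow accumulator.
  assert(std::accumulate(Acc.begin(), Acc.end(), int64_t(0)) == 48);
  return 0;
}

The narrower accumulators are also what halves the vector register usage in the reg-usage.ll update that follows.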
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index c61361b..25ee100 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index e09ddb4..731d648 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -1636,7 +1636,11 @@ static std::pair<Triple, SubtargetFeatures> getFirstFileTripleAndFeatures() {
case file_magic::macho_object: {
auto Obj = ExitOnErr(
object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()));
- Triple TT = Obj->makeTriple();
+ Triple TT;
+ if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj.get()))
+ TT = MachOObj->getArchTriple();
+ else
+ TT = Obj->makeTriple();
if (Magic == file_magic::coff_object) {
// TODO: Move this to makeTriple() if possible.
TT.setObjectFormat(Triple::COFF);
diff --git a/llvm/unittests/Object/BuildIDTest.cpp b/llvm/unittests/Object/BuildIDTest.cpp
new file mode 100644
index 0000000..04ca636
--- /dev/null
+++ b/llvm/unittests/Object/BuildIDTest.cpp
@@ -0,0 +1,120 @@
+//===- BuildIDTest.cpp - Tests for getBuildID -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/BuildID.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Testing/Support/Error.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+template <class ELFT>
+static Expected<ELFObjectFile<ELFT>> toBinary(SmallVectorImpl<char> &Storage,
+ StringRef Yaml) {
+ raw_svector_ostream OS(Storage);
+ yaml::Input YIn(Yaml);
+ if (!yaml::convertYAML(YIn, OS, [](const Twine &Msg) {}))
+ return createStringError(std::errc::invalid_argument,
+ "unable to convert YAML");
+ return ELFObjectFile<ELFT>::create(MemoryBufferRef(OS.str(), "dummyELF"));
+}
+
+static StringRef getInvalidNoteELF(bool WithShdr) {
+ static std::string WithSection(R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_NOTE
+ FileSize: 0x1a
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+Sections:
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ AddressAlign: 0x04
+ Notes:
+ - Name: "GNU"
+ Desc: "abb50d82b6bdc861"
+ Type: 3
+)");
+ static std::string WithoutSection(WithSection + R"(
+ - Type: SectionHeaderTable
+ NoHeaders: true
+)");
+ if (WithShdr)
+ return WithSection;
+ return WithoutSection;
+}
+
+// The BuildID can be looked up from the section headers when the program
+// header's file size is invalid.
+TEST(BuildIDTest, InvalidPhdrFileSizeWithShdrs) {
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr =
+ toBinary<ELF64LE>(Storage, getInvalidNoteELF(true));
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+ BuildIDRef BuildID = getBuildID(&ElfOrErr.get());
+ EXPECT_EQ(
+ StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()),
+ "\xAB\xB5\x0D\x82\xB6\xBD\xC8\x61");
+}
+
+// The code handles a malformed program header that points at data outside the
+// file.
+TEST(BuildIDTest, InvalidPhdrFileSizeNoShdrs) {
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr =
+ toBinary<ELF64LE>(Storage, getInvalidNoteELF(false));
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+ BuildIDRef BuildID = getBuildID(&ElfOrErr.get());
+ EXPECT_EQ(
+ StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()),
+ "");
+}
+
+// The code handles a malformed section header that points at data outside the
+// file.
+TEST(BuildIDTest, InvalidSectionHeader) {
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr = toBinary<ELF64LE>(Storage, R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_NOTE
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+Sections:
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ AddressAlign: 0x04
+ ShOffset: 0x1a1
+ Notes:
+ - Name: "GNU"
+ Desc: "abb50d82b6bdc861"
+ Type: 3
+)");
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+ BuildIDRef BuildID = getBuildID(&ElfOrErr.get());
+ EXPECT_EQ(
+ StringRef(reinterpret_cast<const char *>(BuildID.data()), BuildID.size()),
+ "\xAB\xB5\x0D\x82\xB6\xBD\xC8\x61");
+}
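For context, a hypothetical caller of the API exercised above might look like this (a sketch using only headers from the test; printBuildID is an illustrative helper, not an LLVM function):

#include "llvm/ADT/StringExtras.h"
#include "llvm/Object/BuildID.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

// Print a binary's GNU build ID as lowercase hex. getBuildID returns an
// empty BuildIDRef when no usable note is found, which is exactly the
// behavior the malformed-header tests above pin down.
static void printBuildID(StringRef Path) {
  Expected<OwningBinary<ObjectFile>> ObjOrErr =
      ObjectFile::createObjectFile(Path);
  if (!ObjOrErr) {
    logAllUnhandledErrors(ObjOrErr.takeError(), errs(), Path + ": ");
    return;
  }
  BuildIDRef ID = getBuildID(ObjOrErr->getBinary());
  if (ID.empty())
    outs() << Path << ": <no build id>\n";
  else
    outs() << Path << ": " << toHex(ID, /*LowerCase=*/true) << "\n";
}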
diff --git a/llvm/unittests/Object/CMakeLists.txt b/llvm/unittests/Object/CMakeLists.txt
index 1343352..cd70a7b 100644
--- a/llvm/unittests/Object/CMakeLists.txt
+++ b/llvm/unittests/Object/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(ObjectTests
ArchiveTest.cpp
+ BuildIDTest.cpp
COFFObjectFileTest.cpp
DXContainerTest.cpp
ELFObjectFileTest.cpp
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index a943e7ac..b99d656 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -203,7 +203,7 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
VPInstruction::BranchOnCond,
{Plan->getOrAddLiveIn(ConstantInt::getTrue(F->getContext()))}));
VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
- Plan, [](PHINode *P) { return nullptr; }, TLI);
+ *Plan, [](PHINode *P) { return nullptr; }, TLI);
VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
EXPECT_EQ(0u, Entry->getNumPredecessors());
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp
index eb075e6..b89d378 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanUncountableExitTest.cpp
@@ -48,7 +48,7 @@ TEST_F(VPUncountableExitTest, FindUncountableExitRecipes) {
BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
auto Plan = buildVPlan(LoopHeader, /*HasUncountableExit=*/true);
VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
- Plan, [](PHINode *P) { return nullptr; }, *TLI);
+ *Plan, [](PHINode *P) { return nullptr; }, *TLI);
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
SmallVector<VPRecipeBase *> Recipes;
@@ -85,7 +85,7 @@ TEST_F(VPUncountableExitTest, NoUncountableExit) {
BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
auto Plan = buildVPlan(LoopHeader);
VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
- Plan, [](PHINode *P) { return nullptr; }, *TLI);
+ *Plan, [](PHINode *P) { return nullptr; }, *TLI);
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
SmallVector<VPRecipeBase *> Recipes;
diff --git a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
index b856d1c..764ebb9 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
@@ -28,6 +28,7 @@ static_library("Rewrite") {
"BuildIDRewriter.cpp",
"DWARFRewriter.cpp",
"ExecutableFileMemoryManager.cpp",
+ "GNUPropertyRewriter.cpp",
"JITLinkLinker.cpp",
"LinuxKernelRewriter.cpp",
"MachORewriteInstance.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
index 9fcb05c..54193c8 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
@@ -10,6 +10,7 @@ unittest("ObjectTests") {
]
sources = [
"ArchiveTest.cpp",
+ "BuildIDTest.cpp",
"COFFObjectFileTest.cpp",
"DXContainerTest.cpp",
"ELFObjectFileTest.cpp",
diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp
index 8ee6308..0d56259 100644
--- a/mlir/lib/CAPI/Transforms/Rewrite.cpp
+++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp
@@ -259,22 +259,23 @@ void mlirIRRewriterDestroy(MlirRewriterBase rewriter) {
/// RewritePatternSet and FrozenRewritePatternSet API
//===----------------------------------------------------------------------===//
-inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) {
+static inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) {
assert(module.ptr && "unexpected null module");
return *(static_cast<mlir::RewritePatternSet *>(module.ptr));
}
-inline MlirRewritePatternSet wrap(mlir::RewritePatternSet *module) {
+static inline MlirRewritePatternSet wrap(mlir::RewritePatternSet *module) {
return {module};
}
-inline mlir::FrozenRewritePatternSet *
+static inline mlir::FrozenRewritePatternSet *
unwrap(MlirFrozenRewritePatternSet module) {
assert(module.ptr && "unexpected null module");
return static_cast<mlir::FrozenRewritePatternSet *>(module.ptr);
}
-inline MlirFrozenRewritePatternSet wrap(mlir::FrozenRewritePatternSet *module) {
+static inline MlirFrozenRewritePatternSet
+wrap(mlir::FrozenRewritePatternSet *module) {
return {module};
}
@@ -321,12 +322,12 @@ inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) {
//===----------------------------------------------------------------------===//
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
-inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) {
+static inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) {
assert(module.ptr && "unexpected null module");
return static_cast<mlir::PDLPatternModule *>(module.ptr);
}
-inline MlirPDLPatternModule wrap(mlir::PDLPatternModule *module) {
+static inline MlirPDLPatternModule wrap(mlir::PDLPatternModule *module) {
return {module};
}
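The `static inline` changes above are linkage fixes rather than behavioral ones: at namespace scope in a .cpp file, `inline` alone still gives a function external linkage, so identically named helpers defined in other translation units (these wrap/unwrap overloads appear in several CAPI files) must have identical definitions or the program violates the ODR. `static` sidesteps that by giving each file its own private copy. A tiny illustration (plain C++, not MLIR code):

// With external linkage, every TU defining this symbol must use an
// identical body, or the program has undefined behavior (ODR violation).
inline int unwrapExternal(int X) { return X + 1; }

// With internal linkage, each TU gets its own private copy; same-named
// helpers in other files cannot collide with it.
static inline int unwrapLocal(int X) { return X + 1; }

int main() { return (unwrapExternal(0) + unwrapLocal(0) == 2) ? 0 : 1; }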
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 3bd763e..05fc7cb 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -1622,12 +1622,12 @@ static void generateCollapsedIndexingRegion(
}
}
-void collapseOperandsAndResults(LinalgOp op,
- const CollapsingInfo &collapsingInfo,
- RewriterBase &rewriter,
- SmallVectorImpl<Value> &inputOperands,
- SmallVectorImpl<Value> &outputOperands,
- SmallVectorImpl<Type> &resultTypes) {
+static void collapseOperandsAndResults(LinalgOp op,
+ const CollapsingInfo &collapsingInfo,
+ RewriterBase &rewriter,
+ SmallVectorImpl<Value> &inputOperands,
+ SmallVectorImpl<Value> &outputOperands,
+ SmallVectorImpl<Type> &resultTypes) {
Location loc = op->getLoc();
inputOperands =
llvm::map_to_vector(op.getDpsInputOperands(), [&](OpOperand *opOperand) {
@@ -1651,8 +1651,8 @@ void collapseOperandsAndResults(LinalgOp op,
/// Clone a `LinalgOp` to a collapsed version of same name
template <typename OpTy>
-OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp,
- const CollapsingInfo &collapsingInfo) {
+static OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp,
+ const CollapsingInfo &collapsingInfo) {
return nullptr;
}
@@ -1699,8 +1699,9 @@ GenericOp cloneToCollapsedOp<GenericOp>(RewriterBase &rewriter,
return collapsedOp;
}
-LinalgOp createCollapsedOp(LinalgOp op, const CollapsingInfo &collapsingInfo,
- RewriterBase &rewriter) {
+static LinalgOp createCollapsedOp(LinalgOp op,
+ const CollapsingInfo &collapsingInfo,
+ RewriterBase &rewriter) {
if (GenericOp genericOp = dyn_cast<GenericOp>(op.getOperation())) {
return cloneToCollapsedOp(rewriter, genericOp, collapsingInfo);
} else {
diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
index fd8ae7e..795766f 100644
--- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
+++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
@@ -35,7 +35,7 @@ namespace mlir {
using OperationDefinition = AsmParserState::OperationDefinition;
/// Return the source code associated with the OperationDefinition.
-SMRange getOpRange(const OperationDefinition &op) {
+static SMRange getOpRange(const OperationDefinition &op) {
const char *startOp = op.scopeLoc.Start.getPointer();
const char *endOp = op.scopeLoc.End.getPointer();
@@ -187,15 +187,15 @@ std::unique_ptr<RewritePad> RewritePad::init(StringRef inputFilename,
}
/// Return the source code associated with the operation name.
-SMRange getOpNameRange(const OperationDefinition &op) { return op.loc; }
+static SMRange getOpNameRange(const OperationDefinition &op) { return op.loc; }
/// Return whether the operation was printed using generic syntax in the
/// original buffer.
-bool isGeneric(const OperationDefinition &op) {
+static bool isGeneric(const OperationDefinition &op) {
return op.loc.Start.getPointer()[0] == '"';
}
-inline int asMainReturnCode(LogicalResult r) {
+static inline int asMainReturnCode(LogicalResult r) {
return r.succeeded() ? EXIT_SUCCESS : EXIT_FAILURE;
}
@@ -293,7 +293,7 @@ static llvm::cl::opt<std::string> simpleRenameReplace{
llvm::cl::cat(clSimpleRenameCategory)};
// Rewriter that does simple renames.
-LogicalResult simpleRename(RewritePad &rewriteState, raw_ostream &os) {
+static LogicalResult simpleRename(RewritePad &rewriteState, raw_ostream &os) {
StringRef opName = simpleRenameOpName;
StringRef match = simpleRenameMatch;
StringRef replace = simpleRenameReplace;
@@ -317,7 +317,7 @@ static mlir::RewriterRegistration rewriteSimpleRename("simple-rename",
simpleRename);
// Rewriter that inserts range markers.
-LogicalResult markRanges(RewritePad &rewriteState, raw_ostream &os) {
+static LogicalResult markRanges(RewritePad &rewriteState, raw_ostream &os) {
for (const auto &it : rewriteState.getOpDefs()) {
auto [startOp, endOp] = getOpRange(it);
diff --git a/mlir/unittests/TableGen/PassGenTest.cpp b/mlir/unittests/TableGen/PassGenTest.cpp
index 27f2fa0..ac01d49 100644
--- a/mlir/unittests/TableGen/PassGenTest.cpp
+++ b/mlir/unittests/TableGen/PassGenTest.cpp
@@ -11,7 +11,8 @@
#include "gmock/gmock.h"
-std::unique_ptr<mlir::Pass> createTestPassWithCustomConstructor(int v = 0);
+static std::unique_ptr<mlir::Pass>
+createTestPassWithCustomConstructor(int v = 0);
#define GEN_PASS_DECL
#define GEN_PASS_REGISTRATION
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h
index 3ea6406..14a3d8e 100644
--- a/orc-rt/include/orc-rt/SPSWrapperFunction.h
+++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h
@@ -21,8 +21,10 @@ namespace orc_rt {
namespace detail {
template <typename... SPSArgTs> struct WFSPSHelper {
- template <typename... ArgTs>
- std::optional<WrapperFunctionBuffer> serialize(const ArgTs &...Args) {
+private:
+ template <typename... SerializableArgTs>
+ std::optional<WrapperFunctionBuffer>
+ serializeImpl(const SerializableArgTs &...Args) {
auto R =
WrapperFunctionBuffer::allocate(SPSArgList<SPSArgTs...>::size(Args...));
SPSOutputBuffer OB(R.data(), R.size());
@@ -31,16 +33,61 @@ template <typename... SPSArgTs> struct WFSPSHelper {
return std::move(R);
}
+ template <typename T> static const T &toSerializable(const T &Arg) noexcept {
+ return Arg;
+ }
+
+ static SPSSerializableError toSerializable(Error Err) noexcept {
+ return SPSSerializableError(std::move(Err));
+ }
+
+ template <typename T>
+ static SPSSerializableExpected<T> toSerializable(Expected<T> Arg) noexcept {
+ return SPSSerializableExpected<T>(std::move(Arg));
+ }
+
+ template <typename... Ts> struct DeserializableTuple;
+
+ template <typename... Ts> struct DeserializableTuple<std::tuple<Ts...>> {
+ typedef std::tuple<
+ std::decay_t<decltype(toSerializable(std::declval<Ts>()))>...>
+ type;
+ };
+
+ template <typename... Ts>
+ using DeserializableTuple_t = typename DeserializableTuple<Ts...>::type;
+
+ template <typename T> static T fromSerializable(T &&Arg) noexcept {
+ return Arg;
+ }
+
+ static Error fromSerializable(SPSSerializableError Err) noexcept {
+ return Err.toError();
+ }
+
+ template <typename T>
+ static Expected<T> fromSerializable(SPSSerializableExpected<T> Val) noexcept {
+ return Val.toExpected();
+ }
+
+public:
+ template <typename... ArgTs>
+ std::optional<WrapperFunctionBuffer> serialize(ArgTs &&...Args) {
+ return serializeImpl(toSerializable(std::forward<ArgTs>(Args))...);
+ }
+
template <typename ArgTuple>
std::optional<ArgTuple> deserialize(WrapperFunctionBuffer ArgBytes) {
assert(!ArgBytes.getOutOfBandError() &&
"Should not attempt to deserialize out-of-band error");
SPSInputBuffer IB(ArgBytes.data(), ArgBytes.size());
- ArgTuple Args;
- if (!SPSSerializationTraits<SPSTuple<SPSArgTs...>, ArgTuple>::deserialize(
- IB, Args))
+ DeserializableTuple_t<ArgTuple> Args;
+ if (!SPSSerializationTraits<SPSTuple<SPSArgTs...>,
+ decltype(Args)>::deserialize(IB, Args))
return std::nullopt;
- return Args;
+ return std::apply(
+ [](auto &&...A) { return ArgTuple(fromSerializable(A)...); },
+ std::move(Args));
}
};
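The shape of the new helper machinery: Error and Expected<T> are not directly SPS-serializable, so serialize() maps them to SPSSerializableError/SPSSerializableExpected<T> through the toSerializable overload set, and deserialize() reads into the mapped tuple type before std::apply converts each element back. A standalone sketch of that round-trip idiom with made-up wire types (WireBool and the helpers below are illustrative, not orc-rt API):

#include <cassert>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <utility>

struct WireBool { uint8_t V; }; // stand-in for a wire-format type

static int toSerializable(int X) { return X; } // identity for plain types
static WireBool toSerializable(bool B) { return {uint8_t(B)}; }

static int fromSerializable(int X) { return X; }
static bool fromSerializable(WireBool W) { return W.V != 0; }

// Map a caller-facing tuple type to its wire-type tuple, mirroring
// DeserializableTuple_t above.
template <typename T> struct WireTuple;
template <typename... Ts> struct WireTuple<std::tuple<Ts...>> {
  using type =
      std::tuple<std::decay_t<decltype(toSerializable(std::declval<Ts>()))>...>;
};

int main() {
  using Args = std::tuple<int, bool>;
  WireTuple<Args>::type Wire{41, WireBool{1}}; // as if just deserialized
  Args Out = std::apply(
      [](auto &&...A) { return Args(fromSerializable(A)...); },
      std::move(Wire));
  assert(std::get<0>(Out) == 41 && std::get<1>(Out));
  return 0;
}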
diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h
index 233c3b2..ca165db 100644
--- a/orc-rt/include/orc-rt/WrapperFunction.h
+++ b/orc-rt/include/orc-rt/WrapperFunction.h
@@ -168,7 +168,8 @@ struct ResultDeserializer<std::tuple<Expected<T>>, Serializer> {
Serializer &S) {
if (auto Val = S.result().template deserialize<std::tuple<T>>(
std::move(ResultBytes)))
- return std::move(std::get<0>(*Val));
+ return Expected<T>(std::move(std::get<0>(*Val)),
+ ForceExpectedSuccessValue());
else
return make_error<StringError>("Could not deserialize result");
}
diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
index 0b65515..c0c86ff 100644
--- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp
+++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
@@ -144,3 +144,77 @@ TEST(SPSWrapperFunctionUtilsTest, TestBinaryOpViaFunctionPointer) {
[&](Expected<int32_t> R) { Result = cantFail(std::move(R)); }, 41, 1);
EXPECT_EQ(Result, 42);
}
+
+static void improbable_feat_sps_wrapper(orc_rt_SessionRef Session,
+ void *CallCtx,
+ orc_rt_WrapperFunctionReturn Return,
+ orc_rt_WrapperFunctionBuffer ArgBytes) {
+ SPSWrapperFunction<SPSError(bool)>::handle(
+ Session, CallCtx, Return, ArgBytes,
+ [](move_only_function<void(Error)> Return, bool LuckyHat) {
+ if (LuckyHat)
+ Return(Error::success());
+ else
+ Return(make_error<StringError>("crushed by boulder"));
+ });
+}
+
+TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningErrorSuccessCase) {
+ bool DidRun = false;
+ SPSWrapperFunction<SPSError(bool)>::call(
+ DirectCaller(nullptr, improbable_feat_sps_wrapper),
+ [&](Expected<Error> E) {
+ DidRun = true;
+ cantFail(cantFail(std::move(E)));
+ },
+ true);
+
+ EXPECT_TRUE(DidRun);
+}
+
+TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningErrorFailureCase) {
+ std::string ErrMsg;
+ SPSWrapperFunction<SPSError(bool)>::call(
+ DirectCaller(nullptr, improbable_feat_sps_wrapper),
+ [&](Expected<Error> E) { ErrMsg = toString(cantFail(std::move(E))); },
+ false);
+
+ EXPECT_EQ(ErrMsg, "crushed by boulder");
+}
+
+static void halve_number_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+ orc_rt_WrapperFunctionReturn Return,
+ orc_rt_WrapperFunctionBuffer ArgBytes) {
+ SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::handle(
+ Session, CallCtx, Return, ArgBytes,
+ [](move_only_function<void(Expected<int32_t>)> Return, int N) {
+ if (N % 2 == 0)
+ Return(N >> 1);
+ else
+ Return(make_error<StringError>("N is not a multiple of 2"));
+ });
+}
+
+TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedSuccessCase) {
+ int32_t Result = 0;
+ SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::call(
+ DirectCaller(nullptr, halve_number_sps_wrapper),
+ [&](Expected<Expected<int32_t>> R) {
+ Result = cantFail(cantFail(std::move(R)));
+ },
+ 2);
+
+ EXPECT_EQ(Result, 1);
+}
+
+TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedFailureCase) {
+ std::string ErrMsg;
+ SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::call(
+ DirectCaller(nullptr, halve_number_sps_wrapper),
+ [&](Expected<Expected<int32_t>> R) {
+ ErrMsg = toString(cantFail(std::move(R)).takeError());
+ },
+ 3);
+
+ EXPECT_EQ(ErrMsg, "N is not a multiple of 2");
+}
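A note on the doubly wrapped results in these tests: the outer Expected is produced by the call machinery and reports transport or deserialization failure, while the inner Error/Expected<int32_t> is the remote function's own return value, hence the nested cantFail calls. Unwrapping by hand would look roughly like this (a sketch, assuming orc-rt's Expected/consumeError mirror LLVM's Error API):

// unwrapOrDefault is an illustrative helper, not orc-rt API.
static int32_t unwrapOrDefault(Expected<Expected<int32_t>> R, int32_t Default) {
  if (!R) { // the call itself failed (e.g. result could not be deserialized)
    consumeError(R.takeError());
    return Default;
  }
  if (!*R) { // the remote function ran and returned an error
    consumeError(R->takeError());
    return Default;
  }
  return **R; // remote success value
}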