Diffstat (limited to 'llvm')
-rw-r--r--  llvm/.clang-format | 2
-rw-r--r--  llvm/docs/TableGen/ProgRef.rst | 22
-rw-r--r--  llvm/include/llvm/ADT/BitVector.h | 5
-rw-r--r--  llvm/include/llvm/ADT/ConcurrentHashtable.h | 5
-rw-r--r--  llvm/include/llvm/ADT/DirectedGraph.h | 10
-rw-r--r--  llvm/include/llvm/Analysis/IR2Vec.h | 9
-rw-r--r--  llvm/include/llvm/Analysis/MemoryProfileInfo.h | 8
-rw-r--r--  llvm/include/llvm/CodeGen/MIRYamlMapping.h | 2
-rw-r--r--  llvm/include/llvm/CodeGen/MachineFrameInfo.h | 13
-rw-r--r--  llvm/include/llvm/CodeGen/TargetFrameLowering.h | 1
-rw-r--r--  llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h | 1
-rw-r--r--  llvm/include/llvm/IR/IntrinsicInst.h | 20
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsAArch64.td | 30
-rw-r--r--  llvm/lib/Analysis/IR2Vec.cpp | 86
-rw-r--r--  llvm/lib/Analysis/MemoryProfileInfo.cpp | 22
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/RegAllocGreedy.cpp | 71
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 50
-rw-r--r--  llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/JITLink.cpp | 3
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp | 30
-rw-r--r--  llvm/lib/Support/APFloat.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 492
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.h | 30
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 19
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 22
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp | 20
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 83
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp | 444
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 10
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVGISel.td | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 9
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 52
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td | 20
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp | 43
-rw-r--r--  llvm/lib/Target/VE/VEISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 44
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXType.cpp | 30
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 48
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 1
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 29
-rw-r--r--  llvm/lib/Transforms/Utils/LoopPeel.cpp | 121
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 23
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 19
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 20
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 130
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10
-rw-r--r--  llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll | 126
-rw-r--r--  llvm/test/CMakeLists.txt | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir | 8
-rw-r--r--  llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-split-sve.mir | 587
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-sve.mir | 12
-rw-r--r--  llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir | 16
-rw-r--r--  llvm/test/CodeGen/AArch64/spillfill-sve.mir | 10
-rw-r--r--  llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll | 824
-rw-r--r--  llvm/test/CodeGen/AArch64/stack-hazard.ll | 876
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll | 10
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll | 2854
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 33
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll | 92
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll | 89
-rw-r--r--  llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir | 16
-rw-r--r--  llvm/test/CodeGen/ARM/and-mask-variable.ll | 90
-rw-r--r--  llvm/test/CodeGen/ARM/extract-bits.ll | 4591
-rw-r--r--  llvm/test/CodeGen/ARM/extract-lowbits.ll | 2752
-rw-r--r--  llvm/test/CodeGen/ARM/llrint-conv.ll | 69
-rw-r--r--  llvm/test/CodeGen/ARM/lrint-conv.ll | 37
-rw-r--r--  llvm/test/CodeGen/ARM/vector-lrint.ll | 20
-rw-r--r--  llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll | 23
-rw-r--r--  llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll | 145
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/remat.ll | 132
-rw-r--r--  llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll | 53
-rw-r--r--  llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll | 19
-rw-r--r--  llvm/test/CodeGen/SystemZ/fp-cmp-04.ll | 4
-rw-r--r--  llvm/test/CodeGen/VE/Vector/vec_divrem.ll | 56
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll | 78
-rw-r--r--  llvm/test/CodeGen/X86/fshl.ll | 81
-rw-r--r--  llvm/test/CodeGen/X86/fshr.ll | 90
-rw-r--r--  llvm/test/CodeGen/X86/pr161693.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/sbb.ll | 29
-rw-r--r--  llvm/test/CodeGen/X86/shift-i128.ll | 3
-rw-r--r--  llvm/test/DebugInfo/AArch64/asan-stack-vars.mir | 3
-rw-r--r--  llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir | 3
-rw-r--r--  llvm/test/DebugInfo/X86/dynamic-bitfield.ll | 13
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s | 2
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s | 7
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s | 7
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s | 32
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s | 2
-rw-r--r--  llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s | 2
-rw-r--r--  llvm/test/Transforms/GVN/condprop.ll | 60
-rw-r--r--  llvm/test/Transforms/InstCombine/fcmp.ll | 40
-rw-r--r--  llvm/test/Transforms/InstCombine/icmp-clamp.ll | 295
-rw-r--r--  llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll | 75
-rw-r--r--  llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll | 64
-rw-r--r--  llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll | 13
-rw-r--r--  llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll | 244
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-order.ll | 116
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll | 47
-rw-r--r--  llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 6
-rw-r--r--  llvm/unittests/ADT/APFloatTest.cpp | 7
-rw-r--r--  llvm/unittests/Analysis/MemoryProfileInfoTest.cpp | 21
-rw-r--r--  llvm/unittests/Frontend/CMakeLists.txt | 1
-rw-r--r--  llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn | 1
129 files changed, 15748 insertions, 1676 deletions
diff --git a/llvm/.clang-format b/llvm/.clang-format
index 5bead5f..ecb44bf 100644
--- a/llvm/.clang-format
+++ b/llvm/.clang-format
@@ -1,2 +1,2 @@
BasedOnStyle: LLVM
-
+LineEnding: LF
diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst
index 2b1af05..0ff4cc7 100644
--- a/llvm/docs/TableGen/ProgRef.rst
+++ b/llvm/docs/TableGen/ProgRef.rst
@@ -64,7 +64,7 @@ Classes and concrete records have a unique *name*, either chosen by
the programmer or generated by TableGen. Associated with that name
is a list of *fields* with values and an optional list of *parent classes*
(sometimes called base or super classes). The fields are the primary data that
-backends will process. Note that TableGen assigns no meanings to fields; the
+backends will process. Note that TableGen assigns no meaning to fields; the
meanings are entirely up to the backends and the programs that incorporate
the output of those backends.
@@ -243,7 +243,7 @@ Include files
-------------
TableGen has an include mechanism. The content of the included file
-lexically replaces the ``include`` directive and is then parsed as if it was
+lexically replaces the ``include`` directive and is then parsed as if it were
originally in the main file.
.. productionlist::
@@ -670,17 +670,17 @@ name of a multiclass.
The argument values can be specified in two forms:
* Positional argument (``value``). The value is assigned to the argument in the
- corresponding position. For ``Foo<a0, a1>``, ``a0`` will be assigned to first
- argument and ``a1`` will be assigned to second argument.
+ corresponding position. For ``Foo<a0, a1>``, ``a0`` will be assigned to the first
+ argument and ``a1`` will be assigned to the second argument.
* Named argument (``name=value``). The value is assigned to the argument with
the specified name. For ``Foo<a=a0, b=a1>``, ``a0`` will be assigned to the
argument with name ``a`` and ``a1`` will be assigned to the argument with
name ``b``.
-Required arguments can also be specified as named argument.
+Required arguments can also be specified as a named argument.
Note that the argument can only be specified once regardless of the way (named
-or positional) to specify and positional arguments should be put before named
+or positional) to specify and positional arguments should precede named
arguments.
.. productionlist::
@@ -817,7 +817,7 @@ type. It provides a single field, ``Value``, which holds a 3-bit number. Its
template argument, ``val``, is used to set the ``Value`` field. Each of the
eight records is defined with ``FPFormat`` as its parent class. The
enumeration value is passed in angle brackets as the template argument. Each
-record will inherent the ``Value`` field with the appropriate enumeration
+record will inherit the ``Value`` field with the appropriate enumeration
value.
Here is a more complex example of classes with template arguments. First, we
@@ -1308,7 +1308,7 @@ with ``F0``, ``F1``, ``F2``, and ``F3``.
-------------------------------------
A ``dump`` statement prints the input string to standard error
-output. It is intended for debugging purpose.
+output. It is intended for debugging purposes.
* At top level, the message is printed immediately.
@@ -1727,7 +1727,7 @@ and non-0 as true.
``!div(``\ *a*\ ``,`` *b*\ ``)``
This operator performs signed division of *a* by *b*, and produces the quotient.
- Division by 0 produces an error. Division of INT64_MIN by -1 produces an error.
+ Division by 0 produces an error. Division of ``INT64_MIN`` by -1 produces an error.
``!empty(``\ *a*\ ``)``
This operator produces 1 if the string, list, or DAG *a* is empty; 0 otherwise.
@@ -1914,7 +1914,7 @@ and non-0 as true.
``!or(``\ *a*\ ``,`` *b*\ ``, ...)``
This operator does a bitwise OR on *a*, *b*, etc., and produces the
result. A logical OR can be performed if all the arguments are either
- 0 or 1. This operator is short-circuit to -1 (all ones) the left-most
+ 0 or 1. This operator is short-circuit to -1 (all ones) when the left-most
operand is -1.
``!range([``\ *start*\ ``,]`` *end*\ ``[,``\ *step*\ ``])``
@@ -1937,7 +1937,7 @@ and non-0 as true.
Equivalent to ``!range(0, !size(list))``.
``!repr(``\ *value*\ ``)``
- Represents *value* as a string. String format for the value is not
+ Represents *value* as a string. The string format for the value is not
guaranteed to be stable. Intended for debugging purposes only.
``!setdagarg(``\ *dag*\ ``,``\ *key*\ ``,``\ *arg*\ ``)``
diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h
index 83350e6..9e81a4b 100644
--- a/llvm/include/llvm/ADT/BitVector.h
+++ b/llvm/include/llvm/ADT/BitVector.h
@@ -570,10 +570,7 @@ public:
template <class F, class... ArgTys>
static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg,
ArgTys const &...Args) {
- assert(llvm::all_of(
- std::initializer_list<unsigned>{Args.size()...},
- [&Arg](auto const &BV) { return Arg.size() == BV; }) &&
- "consistent sizes");
+ assert(((Arg.size() == Args.size()) && ...) && "consistent sizes");
Out.resize(Arg.size());
for (size_type I = 0, E = Arg.Bits.size(); I != E; ++I)
Out.Bits[I] = f(Arg.Bits[I], Args.Bits[I]...);
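
(A minimal standalone sketch, not part of the patch, of the C++17 fold-expression pattern the new assert relies on; the helper name allSameSize is invented for illustration.)

#include <cassert>
#include <vector>

// Sketch only: mirrors the ((Arg.size() == Args.size()) && ...) fold above.
// Returns true when every container in Rest has the same size as First.
template <class T, class... Ts>
static bool allSameSize(const T &First, const Ts &...Rest) {
  return ((First.size() == Rest.size()) && ...);
}

int main() {
  std::vector<int> A(4), B(4), C(4);
  // Expands to (A.size() == B.size()) && (A.size() == C.size()).
  assert(allSameSize(A, B, C));
  return 0;
}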
diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h
index 6de194d..6a943c5 100644
--- a/llvm/include/llvm/ADT/ConcurrentHashtable.h
+++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h
@@ -253,9 +253,8 @@ public:
OS << "\nOverall number of entries = " << OverallNumberOfEntries;
OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets;
- for (auto &BucketSize : BucketSizesMap)
- OS << "\n Number of buckets with size " << BucketSize.first << ": "
- << BucketSize.second;
+ for (auto [Size, Count] : BucketSizesMap)
+ OS << "\n Number of buckets with size " << Size << ": " << Count;
std::stringstream stream;
stream << std::fixed << std::setprecision(2)
diff --git a/llvm/include/llvm/ADT/DirectedGraph.h b/llvm/include/llvm/ADT/DirectedGraph.h
index 83c0bea..fb6b180 100644
--- a/llvm/include/llvm/ADT/DirectedGraph.h
+++ b/llvm/include/llvm/ADT/DirectedGraph.h
@@ -181,16 +181,6 @@ public:
DirectedGraph() = default;
explicit DirectedGraph(NodeType &N) : Nodes() { addNode(N); }
- DirectedGraph(const DGraphType &G) : Nodes(G.Nodes) {}
- DirectedGraph(DGraphType &&RHS) : Nodes(std::move(RHS.Nodes)) {}
- DGraphType &operator=(const DGraphType &G) {
- Nodes = G.Nodes;
- return *this;
- }
- DGraphType &operator=(const DGraphType &&G) {
- Nodes = std::move(G.Nodes);
- return *this;
- }
const_iterator begin() const { return Nodes.begin(); }
const_iterator end() const { return Nodes.end(); }
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index b7c3015..ed43f19 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -210,6 +210,13 @@ public:
const_iterator end() const {
return const_iterator(this, getNumSections(), 0);
}
+
+ using VocabMap = std::map<std::string, Embedding>;
+ /// Parse a vocabulary section from JSON and populate the target vocabulary
+ /// map.
+ static Error parseVocabSection(StringRef Key,
+ const json::Value &ParsedVocabValue,
+ VocabMap &TargetVocab, unsigned &Dim);
};
/// Class for storing and accessing the IR2Vec vocabulary.
@@ -600,8 +607,6 @@ class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab,
VocabMap &ArgVocab);
- Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue,
- VocabMap &TargetVocab, unsigned &Dim);
void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab,
VocabMap &ArgVocab);
void emitError(Error Err, LLVMContext &Ctx);
diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index be690a4..571caf9 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -59,14 +59,6 @@ LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type);
/// True if the AllocTypes bitmask contains just a single type.
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes);
-/// Removes any existing "ambiguous" memprof attribute. Called before we apply a
-/// specific allocation type such as "cold", "notcold", or "hot".
-LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB);
-
-/// Adds an "ambiguous" memprof attribute to call with a matched allocation
-/// profile but that we haven't yet been able to disambiguate.
-LLVM_ABI void addAmbiguousAttribute(CallBase *CB);
-
/// Class to build a trie of call stack contexts for a particular profiled
/// allocation call, along with their associated allocation types.
/// The allocation will be at the root of the trie, which is then used to
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index c7304e3..e80c138 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -378,6 +378,8 @@ struct ScalarEnumerationTraits<TargetStackID::Value> {
IO.enumCase(ID, "default", TargetStackID::Default);
IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill);
IO.enumCase(ID, "scalable-vector", TargetStackID::ScalableVector);
+ IO.enumCase(ID, "scalable-predicate-vector",
+ TargetStackID::ScalablePredicateVector);
IO.enumCase(ID, "wasm-local", TargetStackID::WasmLocal);
IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc);
}
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 00c7343..50ce931 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -497,7 +497,18 @@ public:
/// Should this stack ID be considered in MaxAlignment.
bool contributesToMaxAlignment(uint8_t StackID) {
return StackID == TargetStackID::Default ||
- StackID == TargetStackID::ScalableVector;
+ StackID == TargetStackID::ScalableVector ||
+ StackID == TargetStackID::ScalablePredicateVector;
+ }
+
+ bool hasScalableStackID(int ObjectIdx) const {
+ uint8_t StackID = getStackID(ObjectIdx);
+ return isScalableStackID(StackID);
+ }
+
+ bool isScalableStackID(uint8_t StackID) const {
+ return StackID == TargetStackID::ScalableVector ||
+ StackID == TargetStackID::ScalablePredicateVector;
}
/// setObjectAlignment - Change the alignment of the specified stack object.
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 0e29e45..75696faf 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -32,6 +32,7 @@ enum Value {
SGPRSpill = 1,
ScalableVector = 2,
WasmLocal = 3,
+ ScalablePredicateVector = 4,
NoAlloc = 255
};
}
diff --git a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
index 52af205..ffe0b50 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h
@@ -179,6 +179,7 @@ public:
class DWARFDataExtractorSimple
: public DWARFDataExtractorBase<DWARFDataExtractorSimple> {
+public:
using DWARFDataExtractorBase::DWARFDataExtractorBase;
LLVM_ABI uint64_t getRelocatedValueImpl(uint32_t Size, uint64_t *Off,
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index eb0440f..0622bfa 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -810,6 +810,26 @@ public:
/// Whether the intrinsic is signed or unsigned.
bool isSigned() const { return isSigned(getIntrinsicID()); };
+ /// Whether the intrinsic is a smin or umin.
+ static bool isMin(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::umin:
+ case Intrinsic::smin:
+ return true;
+ case Intrinsic::umax:
+ case Intrinsic::smax:
+ return false;
+ default:
+ llvm_unreachable("Invalid intrinsic");
+ }
+ }
+
+ /// Whether the intrinsic is a smin or a umin.
+ bool isMin() const { return isMin(getIntrinsicID()); }
+
+ /// Whether the intrinsic is a smax or a umax.
+ bool isMax() const { return !isMin(getIntrinsicID()); }
+
/// Min/max intrinsics are monotonic, they operate on a fixed-bitwidth values,
/// so there is a certain threshold value, upon reaching which,
/// their value can no longer change. Return said threshold.
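
(A hedged sketch, not part of the patch, of how the new isMin()/isMax() queries might be combined with the pre-existing isSigned(); the helper minMaxName is hypothetical.)

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/IntrinsicInst.h"

// Hypothetical helper (illustration only): pick a textual name for a min/max
// intrinsic using the new isMin() query together with isSigned().
static llvm::StringRef minMaxName(const llvm::MinMaxIntrinsic &MM) {
  if (MM.isMin())
    return MM.isSigned() ? "smin" : "umin";
  // isMax() is by definition the negation of isMin().
  return MM.isSigned() ? "smax" : "umax";
}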
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7c9aef5..fbc92d7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -130,8 +130,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class AdvSIMD_1VectorArg_Expand_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
- class AdvSIMD_1VectorArg_Long_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>;
class AdvSIMD_1IntArg_Narrow_Intrinsic
: DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty], [IntrNoMem]>;
class AdvSIMD_1VectorArg_Narrow_Intrinsic
@@ -150,9 +148,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
class AdvSIMD_2VectorArg_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class AdvSIMD_2VectorArg_Compare_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>],
- [IntrNoMem]>;
class AdvSIMD_2Arg_FloatCompare_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>],
[IntrNoMem]>;
@@ -160,10 +155,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMTruncatedType<0>, LLVMTruncatedType<0>],
[IntrNoMem]>;
- class AdvSIMD_2VectorArg_Wide_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMTruncatedType<0>],
- [IntrNoMem]>;
class AdvSIMD_2VectorArg_Narrow_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMExtendedType<0>, LLVMExtendedType<0>],
@@ -172,10 +163,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMExtendedType<0>, llvm_i32_ty],
[IntrNoMem]>;
- class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [llvm_anyvector_ty],
- [IntrNoMem]>;
class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMTruncatedType<0>],
@@ -184,10 +171,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMTruncatedType<0>, llvm_i32_ty],
[IntrNoMem]>;
- class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty],
- [IntrNoMem]>;
class AdvSIMD_2VectorArg_Lane_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
@@ -205,14 +188,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem]>;
- class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty,
- LLVMMatchType<1>], [IntrNoMem]>;
- class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty, llvm_i32_ty],
- [IntrNoMem]>;
class AdvSIMD_CvtFxToFP_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
[IntrNoMem]>;
@@ -238,11 +213,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
- class AdvSIMD_FML_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
- [IntrNoMem]>;
-
class AdvSIMD_BF16FML_Intrinsic
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index af30422..295b6d3 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -330,6 +330,43 @@ bool VocabStorage::const_iterator::operator!=(
return !(*this == Other);
}
+Error VocabStorage::parseVocabSection(StringRef Key,
+ const json::Value &ParsedVocabValue,
+ VocabMap &TargetVocab, unsigned &Dim) {
+ json::Path::Root Path("");
+ const json::Object *RootObj = ParsedVocabValue.getAsObject();
+ if (!RootObj)
+ return createStringError(errc::invalid_argument,
+ "JSON root is not an object");
+
+ const json::Value *SectionValue = RootObj->get(Key);
+ if (!SectionValue)
+ return createStringError(errc::invalid_argument,
+ "Missing '" + std::string(Key) +
+ "' section in vocabulary file");
+ if (!json::fromJSON(*SectionValue, TargetVocab, Path))
+ return createStringError(errc::illegal_byte_sequence,
+ "Unable to parse '" + std::string(Key) +
+ "' section from vocabulary");
+
+ Dim = TargetVocab.begin()->second.size();
+ if (Dim == 0)
+ return createStringError(errc::illegal_byte_sequence,
+ "Dimension of '" + std::string(Key) +
+ "' section of the vocabulary is zero");
+
+ if (!std::all_of(TargetVocab.begin(), TargetVocab.end(),
+ [Dim](const std::pair<StringRef, Embedding> &Entry) {
+ return Entry.second.size() == Dim;
+ }))
+ return createStringError(
+ errc::illegal_byte_sequence,
+ "All vectors in the '" + std::string(Key) +
+ "' section of the vocabulary are not of the same dimension");
+
+ return Error::success();
+}
+
// ==----------------------------------------------------------------------===//
// Vocabulary
//===----------------------------------------------------------------------===//
@@ -460,43 +497,6 @@ VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) {
// IR2VecVocabAnalysis
//===----------------------------------------------------------------------===//
-Error IR2VecVocabAnalysis::parseVocabSection(
- StringRef Key, const json::Value &ParsedVocabValue, VocabMap &TargetVocab,
- unsigned &Dim) {
- json::Path::Root Path("");
- const json::Object *RootObj = ParsedVocabValue.getAsObject();
- if (!RootObj)
- return createStringError(errc::invalid_argument,
- "JSON root is not an object");
-
- const json::Value *SectionValue = RootObj->get(Key);
- if (!SectionValue)
- return createStringError(errc::invalid_argument,
- "Missing '" + std::string(Key) +
- "' section in vocabulary file");
- if (!json::fromJSON(*SectionValue, TargetVocab, Path))
- return createStringError(errc::illegal_byte_sequence,
- "Unable to parse '" + std::string(Key) +
- "' section from vocabulary");
-
- Dim = TargetVocab.begin()->second.size();
- if (Dim == 0)
- return createStringError(errc::illegal_byte_sequence,
- "Dimension of '" + std::string(Key) +
- "' section of the vocabulary is zero");
-
- if (!std::all_of(TargetVocab.begin(), TargetVocab.end(),
- [Dim](const std::pair<StringRef, Embedding> &Entry) {
- return Entry.second.size() == Dim;
- }))
- return createStringError(
- errc::illegal_byte_sequence,
- "All vectors in the '" + std::string(Key) +
- "' section of the vocabulary are not of the same dimension");
-
- return Error::success();
-}
-
// FIXME: Make this optional. We can avoid file reads
// by auto-generating a default vocabulary during the build time.
Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab,
@@ -513,16 +513,16 @@ Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab,
return ParsedVocabValue.takeError();
unsigned OpcodeDim = 0, TypeDim = 0, ArgDim = 0;
- if (auto Err =
- parseVocabSection("Opcodes", *ParsedVocabValue, OpcVocab, OpcodeDim))
+ if (auto Err = VocabStorage::parseVocabSection("Opcodes", *ParsedVocabValue,
+ OpcVocab, OpcodeDim))
return Err;
- if (auto Err =
- parseVocabSection("Types", *ParsedVocabValue, TypeVocab, TypeDim))
+ if (auto Err = VocabStorage::parseVocabSection("Types", *ParsedVocabValue,
+ TypeVocab, TypeDim))
return Err;
- if (auto Err =
- parseVocabSection("Arguments", *ParsedVocabValue, ArgVocab, ArgDim))
+ if (auto Err = VocabStorage::parseVocabSection("Arguments", *ParsedVocabValue,
+ ArgVocab, ArgDim))
return Err;
if (!(OpcodeDim == TypeDim && TypeDim == ArgDim))
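
(A sketch, not part of the patch, of calling the now-public VocabStorage::parseVocabSection directly; the JSON content, the helper name, and the ir2vec namespace qualification are illustrative assumptions.)

#include "llvm/Analysis/IR2Vec.h"
#include "llvm/Support/JSON.h"

using namespace llvm;

// Illustration only: a tiny vocabulary with just an "Opcodes" section (real
// vocabulary files also carry "Types" and "Arguments" sections).
static Error parseTinyVocab(unsigned &Dim) {
  StringRef Data = R"({"Opcodes": {"add": [0.1, 0.2], "ret": [0.3, 0.4]}})";
  Expected<json::Value> Parsed = json::parse(Data);
  if (!Parsed)
    return Parsed.takeError();

  ir2vec::VocabStorage::VocabMap OpcVocab;
  // parseVocabSection is now a public static member of VocabStorage.
  return ir2vec::VocabStorage::parseVocabSection("Opcodes", *Parsed, OpcVocab,
                                                 Dim);
}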
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 11602d2..0c1f8db 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -125,24 +125,6 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) {
return NumAllocTypes == 1;
}
-void llvm::memprof::removeAnyExistingAmbiguousAttribute(CallBase *CB) {
- if (!CB->hasFnAttr("memprof"))
- return;
- assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
- CB->removeFnAttr("memprof");
-}
-
-void llvm::memprof::addAmbiguousAttribute(CallBase *CB) {
- // We may have an existing ambiguous attribute if we are reanalyzing
- // after inlining.
- if (CB->hasFnAttr("memprof")) {
- assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
- } else {
- auto A = llvm::Attribute::get(CB->getContext(), "memprof", "ambiguous");
- CB->addFnAttr(A);
- }
-}
-
void CallStackTrie::addCallStack(
AllocationType AllocType, ArrayRef<uint64_t> StackIds,
std::vector<ContextTotalSize> ContextSizeInfo) {
@@ -488,9 +470,6 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT,
StringRef Descriptor) {
auto AllocTypeString = getAllocTypeAttributeString(AT);
auto A = llvm::Attribute::get(CI->getContext(), "memprof", AllocTypeString);
- // After inlining we may be able to convert an existing ambiguous allocation
- // to an unambiguous one.
- removeAnyExistingAmbiguousAttribute(CI);
CI->addFnAttr(A);
if (MemProfReportHintedSizes) {
std::vector<ContextTotalSize> ContextSizeInfo;
@@ -550,7 +529,6 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
assert(MIBCallStack.size() == 1 &&
"Should only be left with Alloc's location in stack");
CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
- addAmbiguousAttribute(CI);
return true;
}
// If there exists corner case that CallStackTrie has one chain to leaf
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 62fb5eb..3cfe7cc 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1889,11 +1889,12 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
bool IsBitfield = DT->isBitField();
// Handle the size.
- if (auto *Var = dyn_cast_or_null<DIVariable>(DT->getRawSizeInBits())) {
+ if (DT->getRawSizeInBits() == nullptr) {
+ // No size, just ignore.
+ } else if (auto *Var = dyn_cast<DIVariable>(DT->getRawSizeInBits())) {
if (auto *VarDIE = getDIE(Var))
addDIEEntry(MemberDie, dwarf::DW_AT_bit_size, *VarDIE);
- } else if (auto *Exp =
- dyn_cast_or_null<DIExpression>(DT->getRawSizeInBits())) {
+ } else if (auto *Exp = dyn_cast<DIExpression>(DT->getRawSizeInBits())) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
DwarfExpr.setMemoryLocationKind();
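
(For context, not part of the patch: dyn_cast_or_null tolerates a null argument, while plain dyn_cast does not, which is why the rewritten hunk checks getRawSizeInBits() for null up front. A minimal sketch with hypothetical helper names.)

#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Shape before the change: dyn_cast_or_null<> accepts a null pointer and
// simply returns null.
static DIVariable *sizeVarBefore(const DIDerivedType *DT) {
  return dyn_cast_or_null<DIVariable>(DT->getRawSizeInBits());
}

// Shape after the change: plain dyn_cast<> requires a non-null argument,
// hence the explicit null check, matching the rewritten code above.
static DIVariable *sizeVarAfter(const DIDerivedType *DT) {
  Metadata *RawSize = DT->getRawSizeInBits();
  if (!RawSize)
    return nullptr;
  return dyn_cast<DIVariable>(RawSize);
}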
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 8e6cf3e..7fe13a3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1406,8 +1406,24 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
continue;
// Check if VirtReg interferes with OtherReg after this COPY instruction.
- if (!IsDef && VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot()))
- continue;
+ if (Opnd.readsReg()) {
+ SlotIndex Index = LIS->getInstructionIndex(Instr).getRegSlot();
+
+ if (SubReg) {
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ if (IsDef)
+ Mask = ~Mask;
+
+ if (any_of(VirtReg.subranges(), [=](const LiveInterval::SubRange &S) {
+ return (S.LaneMask & Mask).any() && S.liveAt(Index);
+ })) {
+ continue;
+ }
+ } else {
+ if (VirtReg.liveAt(Index))
+ continue;
+ }
+ }
MCRegister OtherPhysReg =
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
@@ -2419,25 +2435,28 @@ void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
unsigned SubReg = Opnd.getSubReg();
// Get the current assignment.
- MCRegister OtherPhysReg =
- OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
- if (OtherSubReg) {
- if (OtherReg.isPhysical()) {
- MCRegister Tuple =
- TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC);
- if (!Tuple)
- continue;
- OtherPhysReg = Tuple;
- } else {
- // TODO: There should be a hinting mechanism for subregisters
- if (SubReg != OtherSubReg)
- continue;
- }
+ MCRegister OtherPhysReg;
+ if (OtherReg.isPhysical()) {
+ if (OtherSubReg)
+ OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, OtherSubReg, RC);
+ else if (SubReg)
+ OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, SubReg, RC);
+ else
+ OtherPhysReg = OtherReg;
+ } else {
+ OtherPhysReg = VRM->getPhys(OtherReg);
+ // TODO: Should find matching superregister, but applying this in the
+ // non-hint case currently causes regressions
+
+ if (SubReg && OtherSubReg && SubReg != OtherSubReg)
+ continue;
}
// Push the collected information.
- Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
- OtherPhysReg));
+ if (OtherPhysReg) {
+ Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+ OtherPhysReg));
+ }
}
}
@@ -2466,15 +2485,13 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
// We have a broken hint, check if it is possible to fix it by
// reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
// some register and PhysReg may be available for the other live-ranges.
- SmallSet<Register, 4> Visited;
- SmallVector<Register, 2> RecoloringCandidates;
HintsInfo Info;
Register Reg = VirtReg.reg();
MCRegister PhysReg = VRM->getPhys(Reg);
// Start the recoloring algorithm from the input live-interval, then
// it will propagate to the ones that are copy-related with it.
- Visited.insert(Reg);
- RecoloringCandidates.push_back(Reg);
+ SmallSet<Register, 4> Visited = {Reg};
+ SmallVector<Register, 2> RecoloringCandidates = {Reg};
LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI)
<< '(' << printReg(PhysReg, TRI) << ")\n");
@@ -2482,12 +2499,10 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
do {
Reg = RecoloringCandidates.pop_back_val();
- // We cannot recolor physical register.
- if (Reg.isPhysical())
- continue;
+ MCRegister CurrPhys = VRM->getPhys(Reg);
// This may be a skipped register.
- if (!VRM->hasPhys(Reg)) {
+ if (!CurrPhys) {
assert(!shouldAllocateRegister(Reg) &&
"We have an unallocated variable which should have been handled");
continue;
@@ -2496,7 +2511,6 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
// Get the live interval mapped with this virtual register to be able
// to check for the interference with the new color.
LiveInterval &LI = LIS->getInterval(Reg);
- MCRegister CurrPhys = VRM->getPhys(Reg);
// Check that the new color matches the register class constraints and
// that it is free for this live range.
if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
@@ -2533,7 +2547,8 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
// Push all copy-related live-ranges to keep reconciling the broken
// hints.
for (const HintInfo &HI : Info) {
- if (Visited.insert(HI.Reg).second)
+ // We cannot recolor physical register.
+ if (HI.Reg.isVirtual() && Visited.insert(HI.Reg).second)
RecoloringCandidates.push_back(HI.Reg);
}
} while (!RecoloringCandidates.empty());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ff7cd66..87d5453 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6256,17 +6256,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
// FIXME: Not all targets may support EVL in VP_LOAD. These will have been
// removed from the IR by the ExpandVectorPredication pass but we're
// reintroducing them here.
- EVT LdVT = LD->getMemoryVT();
- EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
- EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WideVT.getVectorElementCount());
+ EVT VT = LD->getValueType(0);
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ EVT WideMaskVT = getSetCCResultType(WideVT);
+
if (ExtType == ISD::NON_EXTLOAD &&
TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
TLI.isTypeLegal(WideMaskVT)) {
SDLoc DL(N);
SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(),
- LdVT.getVectorElementCount());
+ VT.getVectorElementCount());
SDValue NewLoad =
DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL,
LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -6303,6 +6303,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
return Result;
}
+ if (VT.isVector()) {
+ // If all else fails replace the load with a wide masked load.
+ SDLoc DL(N);
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+ SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+ DAG.getConstant(0, DL, IdxVT), Len);
+
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
+ DAG.getPOISON(WideVT), LD->getMemoryVT(), LD->getMemOperand(),
+ LD->getAddressingMode(), LD->getExtensionType());
+
+ ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
+ return NewLoad;
+ }
+
report_fatal_error("Unable to widen vector load");
}
@@ -7516,8 +7534,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
SDValue StVal = ST->getValue();
EVT StVT = StVal.getValueType();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
- EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WideVT.getVectorElementCount());
+ EVT WideMaskVT = getSetCCResultType(WideVT);
if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
TLI.isTypeLegal(WideMaskVT)) {
@@ -7540,6 +7557,22 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
}
+ if (StVT.isVector()) {
+ // If all else fails replace the store with a wide masked store.
+ SDLoc DL(N);
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+ SDValue WideStVal = GetWidenedVector(StVal);
+ SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+ DAG.getConstant(0, DL, IdxVT), Len);
+
+ return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
+ ST->getOffset(), Mask, ST->getMemoryVT(),
+ ST->getMemOperand(), ST->getAddressingMode(),
+ ST->isTruncatingStore());
+ }
+
report_fatal_error("Unable to widen vector store");
}
@@ -8298,8 +8331,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
AAMDNodes AAInfo = LD->getAAInfo();
if (LdVT.isScalableVector())
- report_fatal_error("Generating widen scalable extending vector loads is "
- "not yet supported");
+ return SDValue();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 096a33c..64e5cd5 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -72,7 +72,7 @@ struct StackFrameLayoutAnalysis {
: Slot(Idx), Size(MFI.getObjectSize(Idx)),
Align(MFI.getObjectAlign(Idx).value()), Offset(Offset),
SlotTy(Invalid), Scalable(false) {
- Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector;
+ Scalable = MFI.hasScalableStackID(Idx);
if (MFI.isSpillSlotObjectIndex(Idx))
SlotTy = SlotType::Spill;
else if (MFI.isFixedObjectIndex(Idx))
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index 23b72da..6e316f1 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -280,6 +280,9 @@ std::vector<Block *> LinkGraph::splitBlockImpl(std::vector<Block *> Blocks,
void LinkGraph::dump(raw_ostream &OS) {
DenseMap<Block *, std::vector<Symbol *>> BlockSymbols;
+ OS << "LinkGraph \"" << getName()
+ << "\" (triple = " << getTargetTriple().str() << ")\n";
+
// Map from blocks to the symbols pointing at them.
for (auto *Sym : defined_symbols())
BlockSymbols[&Sym->getBlock()].push_back(Sym);
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 584b9f0..17050b0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -21,23 +21,21 @@ JITLinkerBase::~JITLinkerBase() = default;
void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 1 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 1\n");
// Prune and optimize the graph.
if (auto Err = runPasses(Passes.PrePrunePasses))
return Ctx->notifyFailed(std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" pre-pruning:\n";
+ dbgs() << "Link graph pre-pruning:\n";
G->dump(dbgs());
});
prune(*G);
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" post-pruning:\n";
+ dbgs() << "Link graph post-pruning:\n";
G->dump(dbgs());
});
@@ -67,14 +65,15 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
AllocResult AR) {
+ LLVM_DEBUG(dbgs() << "Starting link phase 2\n");
+
if (AR)
Alloc = std::move(*AR);
else
return Ctx->notifyFailed(AR.takeError());
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName()
- << "\" before post-allocation passes:\n";
+ dbgs() << "Link graph before post-allocation passes:\n";
G->dump(dbgs());
});
@@ -131,9 +130,7 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
Expected<AsyncLookupResult> LR) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 3\n");
// If the lookup failed, bail out.
if (!LR)
@@ -143,8 +140,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
applyLookupResult(*LR);
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName()
- << "\" before pre-fixup passes:\n";
+ dbgs() << "Link graph before pre-fixup passes:\n";
G->dump(dbgs());
});
@@ -152,7 +148,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n";
+ dbgs() << "Link graph before copy-and-fixup:\n";
G->dump(dbgs());
});
@@ -161,7 +157,7 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
- dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n";
+ dbgs() << "Link graph after copy-and-fixup:\n";
G->dump(dbgs());
});
@@ -186,16 +182,14 @@ void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
void JITLinkerBase::linkPhase4(std::unique_ptr<JITLinkerBase> Self,
FinalizeResult FR) {
- LLVM_DEBUG({
- dbgs() << "Starting link phase 4 for graph " << G->getName() << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Starting link phase 4\n");
if (!FR)
return Ctx->notifyFailed(FR.takeError());
Ctx->notifyFinalized(std::move(*FR));
- LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; });
+ LLVM_DEBUG({ dbgs() << "Link complete\n"; });
}
Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index d14abb4..8623c06 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -5857,7 +5857,7 @@ DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
// practice.
if (Exp == APFloat::IEK_NaN) {
DoubleAPFloat Quiet{Arg};
- Quiet.getFirst().makeQuiet();
+ Quiet.getFirst() = Quiet.getFirst().makeQuiet();
return Quiet;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 79655e1..0f4bbfc3 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1610,7 +1610,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
int BaseOffset = -AFI->getTaggedBasePointerOffset();
Register FrameReg;
StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
- MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
+ MF, BaseOffset, false /*isFixed*/, TargetStackID::Default /*StackID*/,
+ FrameReg,
/*PreferFP=*/false,
/*ForSimm=*/true);
Register SrcReg = FrameReg;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ab5c6f3..8d6eb91 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -56,15 +56,20 @@
// | async context if needed |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
-// | <hazard padding> |
-// |-----------------------------------|
-// | |
-// | callee-saved fp/simd/SVE regs |
-// | |
-// |-----------------------------------|
-// | |
-// | SVE stack objects |
-// | |
+// Default SVE stack layout Split SVE objects
+// (aarch64-split-sve-objects=false) (aarch64-split-sve-objects=true)
+// |-----------------------------------| |-----------------------------------|
+// | <hazard padding> | | callee-saved PPR registers |
+// |-----------------------------------| |-----------------------------------|
+// | | | PPR stack objects |
+// | callee-saved fp/simd/SVE regs | |-----------------------------------|
+// | | | <hazard padding> |
+// |-----------------------------------| |-----------------------------------|
+// | | | callee-saved ZPR/FPR registers |
+// | SVE stack objects | |-----------------------------------|
+// | | | ZPR stack objects |
+// |-----------------------------------| |-----------------------------------|
+// ^ NB: FPR CSRs are promoted to ZPRs
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
@@ -274,6 +279,11 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
cl::desc("sort stack allocations"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ SplitSVEObjects("aarch64-split-sve-objects",
+ cl::desc("Split allocation of ZPR & PPR objects"),
+ cl::init(false), cl::Hidden);
+
cl::opt<bool> EnableHomogeneousPrologEpilog(
"homogeneous-prolog-epilog", cl::Hidden,
cl::desc("Emit homogeneous prologue and epilogue for the size "
@@ -324,7 +334,41 @@ AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF,
static bool produceCompactUnwindFrame(const AArch64FrameLowering &,
MachineFunction &MF);
-// Conservatively, returns true if the function is likely to have an SVE vectors
+enum class AssignObjectOffsets { No, Yes };
+/// Process all the SVE stack objects and the SVE stack size and offsets for
+/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE
+/// stack sizes set). Returns the size of the SVE stack.
+static SVEStackSizes determineSVEStackSizes(MachineFunction &MF,
+ AssignObjectOffsets AssignOffsets);
+
+static unsigned getStackHazardSize(const MachineFunction &MF) {
+ return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
+}
+
+/// Returns true if PPRs are spilled as ZPRs.
+static bool arePPRsSpilledAsZPR(const MachineFunction &MF) {
+ return MF.getSubtarget().getRegisterInfo()->getSpillSize(
+ AArch64::PPRRegClass) == 16;
+}
+
+StackOffset
+AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return StackOffset::getScalable(AFI->getStackSizeZPR());
+}
+
+StackOffset
+AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const {
+ // With split SVE objects, the hazard padding is added to the PPR region,
+ // which places it between the [GPR, PPR] area and the [ZPR, FPR] area. This
+ // avoids hazards between both GPRs and FPRs and ZPRs and PPRs.
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return StackOffset::get(AFI->hasSplitSVEObjects() ? getStackHazardSize(MF)
+ : 0,
+ AFI->getStackSizePPR());
+}
+
+// Conservatively, returns true if the function is likely to have SVE vectors
// on the stack. This function is safe to be called before callee-saves or
// object offsets have been determined.
static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL,
@@ -338,7 +382,7 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL,
const MachineFrameInfo &MFI = MF.getFrameInfo();
for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) {
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ if (MFI.hasScalableStackID(FI))
return true;
}
@@ -482,13 +526,6 @@ AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF,
}
}
-/// Returns the size of the entire SVE stackframe (calleesaves + spills).
-StackOffset
-AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const {
- const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
-}
-
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
@@ -514,7 +551,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
!Subtarget.hasSVE();
return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
- getSVEStackSize(MF) || LowerQRegCopyThroughMem);
+ AFI->hasSVEStackSize() || LowerQRegCopyThroughMem);
}
/// hasFPImpl - Return true if the specified function should have a dedicated
@@ -557,7 +594,7 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
// CFA in either of these cases.
if (AFI.needsDwarfUnwindInfo(MF) &&
((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) &&
- (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0)))
+ (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize())))
return true;
// With large callframes around we may need to use FP to access the scavenging
// emergency spillslot.
@@ -1126,10 +1163,6 @@ static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}
-static unsigned getStackHazardSize(const MachineFunction &MF) {
- return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
-}
-
void AArch64FrameLowering::emitPacRetPlusLeafHardening(
MachineFunction &MF) const {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
@@ -1212,7 +1245,9 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
- StackOffset SVEStackSize = getSVEStackSize(MF);
+ StackOffset ZPRStackSize = getZPRStackSize(MF);
+ StackOffset PPRStackSize = getPPRStackSize(MF);
+ StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
// For VLA-area objects, just emit an offset at the end of the stack frame.
// Whilst not quite correct, these objects do live at the end of the frame and
@@ -1228,11 +1263,21 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
bool FPAfterSVECalleeSaves =
isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ if (MFI.hasScalableStackID(FI)) {
if (FPAfterSVECalleeSaves &&
- -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize())
+ -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
+ assert(!AFI->hasSplitSVEObjects() &&
+ "split-sve-objects not supported with FPAfterSVECalleeSaves");
return StackOffset::getScalable(ObjectOffset);
- return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
+ }
+ StackOffset AccessOffset{};
+ // The scalable vectors are below (lower address) the scalable predicates
+ // with split SVE objects, so we must subtract the size of the predicates.
+ if (AFI->hasSplitSVEObjects() &&
+ MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ AccessOffset = -PPRStackSize;
+ return AccessOffset +
+ StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
ObjectOffset);
}
@@ -1294,14 +1339,15 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
- bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
- return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
- PreferFP, ForSimm);
+ auto StackID = static_cast<TargetStackID::Value>(MFI.getStackID(FI));
+ return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, StackID,
+ FrameReg, PreferFP, ForSimm);
}
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
- const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
- Register &FrameReg, bool PreferFP, bool ForSimm) const {
+ const MachineFunction &MF, int64_t ObjectOffset, bool isFixed,
+ TargetStackID::Value StackID, Register &FrameReg, bool PreferFP,
+ bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
@@ -1312,8 +1358,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
+ bool isSVE = MFI.isScalableStackID(StackID);
- const StackOffset &SVEStackSize = getSVEStackSize(MF);
+ StackOffset ZPRStackSize = getZPRStackSize(MF);
+ StackOffset PPRStackSize = getPPRStackSize(MF);
+ StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
@@ -1388,12 +1437,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
if (isSVE) {
- StackOffset FPOffset =
- StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
+ StackOffset FPOffset = StackOffset::get(
+ -AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
StackOffset SPOffset =
SVEStackSize +
StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
ObjectOffset);
+
+ // With split SVE objects the ObjectOffset is relative to the split area
+ // (i.e. the PPR area or ZPR area respectively).
+ if (AFI->hasSplitSVEObjects() && StackID == TargetStackID::ScalableVector) {
+ // If we're accessing an SVE vector with split SVE objects...
+ // - From the FP we need to move down past the PPR area:
+ FPOffset -= PPRStackSize;
+ // - From the SP we only need to move up to the ZPR area:
+ SPOffset -= PPRStackSize;
+ // Note: `SPOffset = SVEStackSize + ...`, so `-= PPRStackSize` results in
+ // `SPOffset = ZPRStackSize + ...`.
+ }
+
if (FPAfterSVECalleeSaves) {
FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
@@ -1401,6 +1463,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize());
}
}
+
// Always use the FP for SVE spills if available and beneficial.
if (hasFP(MF) && (SPOffset.getFixed() ||
FPOffset.getScalable() < SPOffset.getScalable() ||
@@ -1408,13 +1471,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
}
-
FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
: (unsigned)AArch64::SP;
+
return SPOffset;
}
- StackOffset ScalableOffset = {};
+ StackOffset SVEAreaOffset = {};
if (FPAfterSVECalleeSaves) {
// In this stack layout, the FP is in between the callee saves and other
// SVE allocations.
@@ -1422,25 +1485,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
if (UseFP) {
if (isFixed)
- ScalableOffset = SVECalleeSavedStack;
+ SVEAreaOffset = SVECalleeSavedStack;
else if (!isCSR)
- ScalableOffset = SVECalleeSavedStack - SVEStackSize;
+ SVEAreaOffset = SVECalleeSavedStack - SVEStackSize;
} else {
if (isFixed)
- ScalableOffset = SVEStackSize;
+ SVEAreaOffset = SVEStackSize;
else if (isCSR)
- ScalableOffset = SVEStackSize - SVECalleeSavedStack;
+ SVEAreaOffset = SVEStackSize - SVECalleeSavedStack;
}
} else {
if (UseFP && !(isFixed || isCSR))
- ScalableOffset = -SVEStackSize;
+ SVEAreaOffset = -SVEStackSize;
if (!UseFP && (isFixed || isCSR))
- ScalableOffset = SVEStackSize;
+ SVEAreaOffset = SVEStackSize;
}
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
- return StackOffset::getFixed(FPOffset) + ScalableOffset;
+ return StackOffset::getFixed(FPOffset) + SVEAreaOffset;
}
// Use the base pointer if we have one.
@@ -1457,7 +1520,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
Offset -= AFI->getLocalStackSize();
}
- return StackOffset::getFixed(Offset) + ScalableOffset;
+ return StackOffset::getFixed(Offset) + SVEAreaOffset;
}
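
A minimal standalone sketch (plain C++, not the LLVM API) of the split-SVE offset arithmetic in resolveFrameOffsetReference above: with split SVE objects, a ZPR object's FP-relative offset must also skip the PPR area, while its SP-relative offset only spans the ZPR area. All sizes are hypothetical scalable-byte values; the real code additionally folds in fixed-byte components such as the frame-record offset.

#include <cassert>
#include <cstdint>

// Scalable-byte offsets only; all names here are illustrative.
struct Offsets {
  int64_t FPOffset; // relative to the frame pointer
  int64_t SPOffset; // relative to the stack pointer
};

// ObjectOffset is the (negative) offset of a ZPR object within the ZPR area.
static Offsets resolveZPRObject(int64_t ObjectOffset, int64_t ZPRStackSize,
                                int64_t PPRStackSize, bool SplitSVEObjects) {
  int64_t SVEStackSize = ZPRStackSize + PPRStackSize;
  Offsets O{ObjectOffset, SVEStackSize + ObjectOffset};
  if (SplitSVEObjects) {
    O.FPOffset -= PPRStackSize; // FP must first move down past the PPR area.
    O.SPOffset -= PPRStackSize; // SPOffset becomes ZPRStackSize + ObjectOffset.
  }
  return O;
}

int main() {
  Offsets O = resolveZPRObject(/*ObjectOffset=*/-16, /*ZPRStackSize=*/32,
                               /*PPRStackSize=*/16, /*SplitSVEObjects=*/true);
  assert(O.FPOffset == -32 && O.SPOffset == 16);
  return 0;
}
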
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
@@ -1614,11 +1677,25 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
RegInc = -1;
FirstReg = Count - 1;
}
+
bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize();
- int ScalableByteOffset =
- FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize();
+
+ int ZPRByteOffset = 0;
+ int PPRByteOffset = 0;
+ bool SplitPPRs = AFI->hasSplitSVEObjects();
+ if (SplitPPRs) {
+ ZPRByteOffset = AFI->getZPRCalleeSavedStackSize();
+ PPRByteOffset = AFI->getPPRCalleeSavedStackSize();
+ } else if (!FPAfterSVECalleeSaves) {
+ ZPRByteOffset =
+ AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize();
+ // Unused: Everything goes in ZPR space.
+ PPRByteOffset = 0;
+ }
+
bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
Register LastReg = 0;
+ bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs;
// When iterating backwards, the loop condition relies on unsigned wraparound.
for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -1647,8 +1724,12 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
llvm_unreachable("Unsupported register class.");
}
+ int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs
+ ? PPRByteOffset
+ : ZPRByteOffset;
+
// Add the stack hazard size as we transition from GPR->FPR CSRs.
- if (AFI->hasStackHazardSlotIndex() &&
+ if (HasCSHazardPadding &&
(!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
ByteOffset += StackFillDir * StackHazardSize;
@@ -1656,7 +1737,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
int Scale = TRI->getSpillSize(*RPI.RC);
// Add the next reg to the pair if it is in the same register class.
- if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
+ if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) {
MCRegister NextReg = CSI[i + RegInc].getReg();
bool IsFirst = i == FirstReg;
switch (RPI.Type) {
@@ -2021,10 +2102,14 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+ if (RPI.Type == RegPairInfo::ZPR) {
MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ } else if (RPI.Type == RegPairInfo::PPR) {
+ MFI.setStackID(FrameIdxReg1, TargetStackID::ScalablePredicateVector);
+ if (RPI.isPaired())
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalablePredicateVector);
}
}
return true;
@@ -2199,6 +2284,13 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
return getMMOFrameID(*MI.memoperands_begin(), MFI);
}
+// Returns true if the LDST MachineInstr \p MI is a PPR access.
+static bool isPPRAccess(const MachineInstr &MI) {
+ return MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
+ MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
+ AArch64::PPRRegClass.contains(MI.getOperand(0).getReg());
+}
+
// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
@@ -2222,26 +2314,50 @@ void AArch64FrameLowering::determineStackHazardSlot(
bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
return AArch64::FPR64RegClass.contains(Reg) ||
AArch64::FPR128RegClass.contains(Reg) ||
- AArch64::ZPRRegClass.contains(Reg) ||
- AArch64::PPRRegClass.contains(Reg);
+ AArch64::ZPRRegClass.contains(Reg);
+ });
+ bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+ return AArch64::PPRRegClass.contains(Reg);
});
bool HasFPRStackObjects = false;
- if (!HasFPRCSRs) {
- std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
+ bool HasPPRStackObjects = false;
+ if (!HasFPRCSRs || SplitSVEObjects) {
+ enum SlotType : uint8_t {
+ Unknown = 0,
+ ZPRorFPR = 1 << 0,
+ PPR = 1 << 1,
+ GPR = 1 << 2,
+ LLVM_MARK_AS_BITMASK_ENUM(GPR)
+ };
+
+ // Find stack slots solely used for one kind of register (ZPR, PPR, etc.),
+ // based on the kinds of accesses used in the function.
+ SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown);
for (auto &MBB : MF) {
for (auto &MI : MBB) {
std::optional<int> FI = getLdStFrameID(MI, MFI);
- if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
- if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
- AArch64InstrInfo::isFpOrNEON(MI))
- FrameObjects[*FI] |= 2;
- else
- FrameObjects[*FI] |= 1;
+ if (!FI || *FI < 0 || *FI >= int(SlotTypes.size()))
+ continue;
+ if (MFI.hasScalableStackID(*FI)) {
+ SlotTypes[*FI] |=
+ isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR;
+ } else {
+ SlotTypes[*FI] |= AArch64InstrInfo::isFpOrNEON(MI)
+ ? SlotType::ZPRorFPR
+ : SlotType::GPR;
}
}
}
- HasFPRStackObjects =
- any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
+
+ for (int FI = 0; FI < int(SlotTypes.size()); ++FI) {
+ HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR;
+ // For SplitSVEObjects, remember that this stack slot is a predicate; this
+ // is needed later when determining the frame layout.
+ if (SlotTypes[FI] == SlotType::PPR) {
+ MFI.setStackID(FI, TargetStackID::ScalablePredicateVector);
+ HasPPRStackObjects = true;
+ }
+ }
}
if (HasFPRCSRs || HasFPRStackObjects) {
@@ -2250,6 +2366,78 @@ void AArch64FrameLowering::determineStackHazardSlot(
<< StackHazardSize << "\n");
AFI->setStackHazardSlotIndex(ID);
}
+
+ // Determine if we should use SplitSVEObjects. This should only be used if
+ // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs.
+ if (SplitSVEObjects) {
+ if (!HasPPRCSRs && !HasPPRStackObjects) {
+ LLVM_DEBUG(
+ dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n");
+ return;
+ }
+
+ if (!HasFPRCSRs && !HasFPRStackObjects) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n");
+ return;
+ }
+
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
+ LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
+ "sized objects or realignment\n");
+ return;
+ }
+
+ if (arePPRsSpilledAsZPR(MF)) {
+ LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with "
+ "-aarch64-enable-zpr-predicate-spills\n");
+ return;
+ }
+
+ // If another calling convention is explicitly set, FPRs can't be promoted
+ // to ZPR callee-saves.
+ if (!is_contained({CallingConv::C, CallingConv::Fast,
+ CallingConv::AArch64_SVE_VectorCall},
+ MF.getFunction().getCallingConv())) {
+ LLVM_DEBUG(
+ dbgs() << "Calling convention is not supported with SplitSVEObjects\n");
+ return;
+ }
+
+ [[maybe_unused]] const AArch64Subtarget &Subtarget =
+ MF.getSubtarget<AArch64Subtarget>();
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Expected SVE to be available for PPRs");
+
+ // With SplitSVEObjects the CS hazard padding is placed between the
+ // PPRs and ZPRs. If there are any FPR CS, there would be a hazard between
+ // them and the CS GPRs. Avoid this by promoting all FPR CS to ZPRs.
+ BitVector FPRZRegs(SavedRegs.size());
+ for (size_t Reg = 0, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) {
+ BitVector::reference RegBit = SavedRegs[Reg];
+ if (!RegBit)
+ continue;
+ unsigned SubRegIdx = 0;
+ if (AArch64::FPR64RegClass.contains(Reg))
+ SubRegIdx = AArch64::dsub;
+ else if (AArch64::FPR128RegClass.contains(Reg))
+ SubRegIdx = AArch64::zsub;
+ else
+ continue;
+ // Clear the bit for the FPR save.
+ RegBit = false;
+ // Mark that we should save the corresponding ZPR.
+ Register ZReg =
+ TRI->getMatchingSuperReg(Reg, SubRegIdx, &AArch64::ZPRRegClass);
+ FPRZRegs.set(ZReg);
+ }
+ SavedRegs |= FPRZRegs;
+
+ AFI->setSplitSVEObjects(true);
+ LLVM_DEBUG(dbgs() << "SplitSVEObjects enabled!\n");
+ }
}
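
A simplified, self-contained model of the slot classification in determineStackHazardSlot above (plain C++; the enum mirrors the diff, everything else is illustrative): each frame slot accumulates the kinds of accesses it sees, and only slots used exclusively as predicates are candidates for re-tagging as ScalablePredicateVector.

#include <cstdint>
#include <vector>

enum SlotType : uint8_t { Unknown = 0, ZPRorFPR = 1 << 0, PPR = 1 << 1, GPR = 1 << 2 };

struct Access {
  int FI;        // frame index touched by a load/store
  SlotType Kind; // kind of register used by the access
};

// Returns the frame indices accessed exclusively as predicates.
static std::vector<int> findPurePPRSlots(int NumSlots,
                                         const std::vector<Access> &Accesses) {
  std::vector<uint8_t> SlotTypes(NumSlots, Unknown);
  for (const Access &A : Accesses)
    if (A.FI >= 0 && A.FI < NumSlots)
      SlotTypes[A.FI] |= A.Kind;
  std::vector<int> PurePPR;
  for (int FI = 0; FI < NumSlots; ++FI)
    if (SlotTypes[FI] == PPR) // never touched as ZPR/FPR or GPR
      PurePPR.push_back(FI);
  return PurePPR;
}

int main() {
  // Hypothetical accesses: slot 0 is PPR-only, slot 1 is mixed PPR and ZPR.
  std::vector<Access> Accesses = {{0, PPR}, {1, PPR}, {1, ZPRorFPR}};
  return findPurePPRSlots(/*NumSlots=*/2, Accesses).size() == 1 ? 0 : 1;
}
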
void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -2260,10 +2448,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2382,17 +2571,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(AArch64::X18);
}
+ // Determine if a Hazard slot should be used and where it should go.
+ // If SplitSVEObjects is used, the hazard padding is placed between the PPRs
+ // and ZPRs. Otherwise, it goes in the callee save area.
+ determineStackHazardSlot(MF, SavedRegs);
+
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
- unsigned SVECSStackSize = 0;
+ unsigned ZPRCSStackSize = 0;
+ unsigned PPRCSStackSize = 0;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
for (unsigned Reg : SavedRegs.set_bits()) {
auto *RC = TRI->getMinimalPhysRegClass(Reg);
assert(RC && "expected register class!");
auto SpillSize = TRI->getSpillSize(*RC);
- if (AArch64::PPRRegClass.contains(Reg) ||
- AArch64::ZPRRegClass.contains(Reg))
- SVECSStackSize += SpillSize;
+ bool IsZPR = AArch64::ZPRRegClass.contains(Reg);
+ bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg);
+ if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF)))
+ ZPRCSStackSize += SpillSize;
+ else if (IsPPR)
+ PPRCSStackSize += SpillSize;
else
CSStackSize += SpillSize;
}
@@ -2402,17 +2600,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// only 64-bit GPRs can be added to SavedRegs.
unsigned NumSavedRegs = SavedRegs.count();
+ // If we have hazard padding in the CS area add that to the size.
+ // If we have hazard padding in the CS area, add that to the size.
+ CSStackSize += getStackHazardSize(MF);
+
// Increase the callee-saved stack size if the function has streaming mode
// changes, as we will need to spill the value of the VG register.
if (requiresSaveVG(MF))
CSStackSize += 8;
- // Determine if a Hazard slot should be used, and increase the CSStackSize by
- // StackHazardSize if so.
- determineStackHazardSlot(MF, SavedRegs);
- if (AFI->hasStackHazardSlotIndex())
- CSStackSize += getStackHazardSize(MF);
-
// If we must call __arm_get_current_vg in the prologue preserve the LR.
if (requiresSaveVG(MF) && !Subtarget.hasSVE())
SavedRegs.set(AArch64::LR);
@@ -2433,8 +2629,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
});
// If any callee-saved registers are used, the frame cannot be eliminated.
- int64_t SVEStackSize =
- alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
+ auto [ZPRLocalStackSize, PPRLocalStackSize] =
+ determineSVEStackSizes(MF, AssignObjectOffsets::No);
+ uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize;
+ uint64_t SVEStackSize =
+ alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16);
bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
@@ -2519,7 +2718,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// instructions.
AFI->setCalleeSavedStackSize(AlignedCSStackSize);
AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
- AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
+ AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16));
}
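
A hedged sketch of the callee-save size bucketing in determineCalleeSaves above (plain C++; RegKind and the example spill sizes are illustrative stand-ins): ZPR spills, plus PPR spills when predicates are spilled as ZPRs, go into the ZPR bucket; remaining PPR spills get their own bucket; everything else stays in the ordinary callee-save size.

#include <cassert>

enum class RegKind { GPR, FPR, ZPR, PPR }; // illustrative, not LLVM register classes

struct CSSizes {
  unsigned CSStackSize = 0;    // GPR/FPR callee saves
  unsigned ZPRCSStackSize = 0; // ZPR (and promoted PPR) callee saves
  unsigned PPRCSStackSize = 0; // PPR callee saves
};

static void addSpill(CSSizes &S, RegKind Kind, unsigned SpillSize,
                     bool PPRsSpilledAsZPR) {
  bool IsZPR = Kind == RegKind::ZPR;
  bool IsPPR = Kind == RegKind::PPR;
  if (IsZPR || (IsPPR && PPRsSpilledAsZPR))
    S.ZPRCSStackSize += SpillSize;
  else if (IsPPR)
    S.PPRCSStackSize += SpillSize;
  else
    S.CSStackSize += SpillSize;
}

int main() {
  CSSizes S;
  addSpill(S, RegKind::ZPR, 16, /*PPRsSpilledAsZPR=*/false);
  addSpill(S, RegKind::PPR, 2, /*PPRsSpilledAsZPR=*/false);
  addSpill(S, RegKind::GPR, 8, /*PPRsSpilledAsZPR=*/false);
  assert(S.ZPRCSStackSize == 16 && S.PPRCSStackSize == 2 && S.CSStackSize == 8);
  return 0;
}
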
bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
@@ -2572,7 +2771,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
// Create a hazard slot as we switch between GPR and FPR CSRs.
- if (AFI->hasStackHazardSlotIndex() &&
+ if (AFI->isStackHazardIncludedInCalleeSaveArea() &&
(!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
AArch64InstrInfo::isFpOrNEON(Reg)) {
assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
@@ -2611,7 +2810,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
}
// Add hazard slot in the case where no FPR CSRs are present.
- if (AFI->hasStackHazardSlotIndex() &&
+ if (AFI->isStackHazardIncludedInCalleeSaveArea() &&
HazardSlotIndex == std::numeric_limits<int>::max()) {
HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
@@ -2658,7 +2857,6 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
assert((Max == std::numeric_limits<int>::min() ||
Max + 1 == CS.getFrameIdx()) &&
"SVE CalleeSaves are not consecutive");
-
Min = std::min(Min, CS.getFrameIdx());
Max = std::max(Max, CS.getFrameIdx());
}
@@ -2666,43 +2864,64 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
return Min != std::numeric_limits<int>::max();
}
-// Process all the SVE stack objects and determine offsets for each
-// object. If AssignOffsets is true, the offsets get assigned.
-// Fills in the first and last callee-saved frame indices into
-// Min/MaxCSFrameIndex, respectively.
-// Returns the size of the stack.
-static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
- int &MinCSFrameIndex,
- int &MaxCSFrameIndex,
- bool AssignOffsets) {
+static SVEStackSizes determineSVEStackSizes(MachineFunction &MF,
+ AssignObjectOffsets AssignOffsets) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ SVEStackSizes SVEStack{};
+
+ // With SplitSVEObjects we maintain separate stack offsets for predicates
+ // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates
+ // are included in the SVE vector area.
+ uint64_t &ZPRStackTop = SVEStack.ZPRStackSize;
+ uint64_t &PPRStackTop =
+ AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize;
+
#ifndef NDEBUG
// First process all fixed stack objects.
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
- assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
+ assert(!MFI.hasScalableStackID(I) &&
"SVE vectors should never be passed on the stack by value, only by "
"reference.");
#endif
- auto Assign = [&MFI](int FI, int64_t Offset) {
+ auto AllocateObject = [&](int FI) {
+ uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector
+ ? ZPRStackTop
+ : PPRStackTop;
+
+ // FIXME: Given that the length of SVE vectors is not necessarily a power of
+ // two, we'd need to align every object dynamically at runtime if the
+ // alignment is larger than 16. This is not yet supported.
+ Align Alignment = MFI.getObjectAlign(FI);
+ if (Alignment > Align(16))
+ report_fatal_error(
+ "Alignment of scalable vectors > 16 bytes is not yet supported");
+
+ StackTop += MFI.getObjectSize(FI);
+ StackTop = alignTo(StackTop, Alignment);
+
+ assert(StackTop < std::numeric_limits<int64_t>::max() &&
+ "SVE StackTop far too large?!");
+
+ int64_t Offset = -int64_t(StackTop);
+ if (AssignOffsets == AssignObjectOffsets::Yes)
+ MFI.setObjectOffset(FI, Offset);
+
LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
- MFI.setObjectOffset(FI, Offset);
};
- int64_t Offset = 0;
-
// Then process all callee saved slots.
+ int MinCSFrameIndex, MaxCSFrameIndex;
if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
- // Assign offsets to the callee save slots.
- for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
- Offset += MFI.getObjectSize(I);
- Offset = alignTo(Offset, MFI.getObjectAlign(I));
- if (AssignOffsets)
- Assign(I, -Offset);
- }
+ for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI)
+ AllocateObject(FI);
}
- // Ensure that the Callee-save area is aligned to 16bytes.
- Offset = alignTo(Offset, Align(16U));
+ // Ensure the CS area is 16-byte aligned.
+ PPRStackTop = alignTo(PPRStackTop, Align(16U));
+ ZPRStackTop = alignTo(ZPRStackTop, Align(16U));
// Create a buffer of SVE objects to allocate and sort it.
SmallVector<int, 8> ObjectsToAllocate;
@@ -2715,48 +2934,31 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
ObjectsToAllocate.push_back(StackProtectorFI);
}
- for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
- unsigned StackID = MFI.getStackID(I);
- if (StackID != TargetStackID::ScalableVector)
- continue;
- if (I == StackProtectorFI)
+
+ for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) {
+ if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI))
continue;
- if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
+ if (MaxCSFrameIndex >= FI && FI >= MinCSFrameIndex)
continue;
- if (MFI.isDeadObjectIndex(I))
+
+ if (MFI.getStackID(FI) != TargetStackID::ScalableVector &&
+ MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector)
continue;
- ObjectsToAllocate.push_back(I);
+ ObjectsToAllocate.push_back(FI);
}
// Allocate all SVE locals and spills
- for (unsigned FI : ObjectsToAllocate) {
- Align Alignment = MFI.getObjectAlign(FI);
- // FIXME: Given that the length of SVE vectors is not necessarily a power of
- // two, we'd need to align every object dynamically at runtime if the
- // alignment is larger than 16. This is not yet supported.
- if (Alignment > Align(16))
- report_fatal_error(
- "Alignment of scalable vectors > 16 bytes is not yet supported");
-
- Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
- if (AssignOffsets)
- Assign(FI, -Offset);
- }
+ for (unsigned FI : ObjectsToAllocate)
+ AllocateObject(FI);
- return Offset;
-}
+ PPRStackTop = alignTo(PPRStackTop, Align(16U));
+ ZPRStackTop = alignTo(ZPRStackTop, Align(16U));
-int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
- MachineFrameInfo &MFI) const {
- int MinCSFrameIndex, MaxCSFrameIndex;
- return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
-}
+ if (AssignOffsets == AssignObjectOffsets::Yes)
+ AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize);
-int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
- MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
- return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
- true);
+ return SVEStack;
}
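
A self-contained model of the AllocateObject lambda in determineSVEStackSizes above (plain C++; names and sizes are illustrative): each SVE object bumps either the ZPR or the PPR stack top (the same top when objects are not split), alignment is capped at 16, and the assigned offset is the negated stack top.

#include <cassert>
#include <cstdint>

struct SVEStackTops {
  uint64_t ZPR = 0;
  uint64_t PPR = 0;
};

static uint64_t alignUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Returns the offset assigned to the object within its area.
static int64_t allocateSVEObject(SVEStackTops &Tops, bool IsPredicate,
                                 bool SplitSVEObjects, uint64_t Size,
                                 uint64_t Alignment) {
  assert(Alignment <= 16 && "scalable alignment > 16 is not supported");
  // Without split SVE objects, predicates share the ZPR area.
  uint64_t &Top = (IsPredicate && SplitSVEObjects) ? Tops.PPR : Tops.ZPR;
  Top = alignUp(Top + Size, Alignment);
  return -int64_t(Top);
}

int main() {
  SVEStackTops Tops;
  int64_t VecOff = allocateSVEObject(Tops, /*IsPredicate=*/false,
                                     /*SplitSVEObjects=*/true, 16, 16);
  int64_t PredOff = allocateSVEObject(Tops, /*IsPredicate=*/true,
                                      /*SplitSVEObjects=*/true, 2, 2);
  assert(VecOff == -16 && PredOff == -2);
  return 0;
}
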
/// Attempts to scavenge a register from \p ScavengeableRegs given the used
@@ -3070,12 +3272,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
"Upwards growing stack unsupported");
- int MinCSFrameIndex, MaxCSFrameIndex;
- int64_t SVEStackSize =
- assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
-
- AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
- AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
+ (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes);
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
@@ -3359,7 +3556,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
Register Reg;
FrameRegOffset = TFI->resolveFrameOffsetReference(
- *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+ *MF, FirstTagStore.Offset, false /*isFixed*/,
+ TargetStackID::Default /*StackID*/, Reg,
/*PreferFP=*/false, /*ForSimm=*/true);
FrameReg = Reg;
FrameRegUpdate = std::nullopt;
@@ -3597,7 +3795,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
// Go to common code if we cannot provide sp + offset.
if (MFI.hasVarSizedObjects() ||
- MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
+ MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() ||
MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
return getFrameIndexReference(MF, FI, FrameReg);
@@ -3699,10 +3897,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
void AArch64FrameLowering::orderFrameObjects(
const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
- if (!OrderFrameObjects || ObjectsToAllocate.empty())
+ const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+
+ if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) ||
+ ObjectsToAllocate.empty())
return;
- const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
for (auto &Obj : ObjectsToAllocate) {
@@ -4080,7 +4280,7 @@ void AArch64FrameLowering::emitRemarks(
}
unsigned RegTy = StackAccess::AccessType::GPR;
- if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
+ if (MFI.hasScalableStackID(FrameIdx)) {
// SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
// spill/fill the predicate as a data vector (so are an FPR access).
if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 7bba053..32a9bd8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -24,6 +24,11 @@ class AArch64FunctionInfo;
class AArch64PrologueEmitter;
class AArch64EpilogueEmitter;
+struct SVEStackSizes {
+ uint64_t ZPRStackSize{0};
+ uint64_t PPRStackSize{0};
+};
+
class AArch64FrameLowering : public TargetFrameLowering {
public:
explicit AArch64FrameLowering()
@@ -64,8 +69,9 @@ public:
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
int64_t ObjectOffset, bool isFixed,
- bool isSVE, Register &FrameReg,
- bool PreferFP, bool ForSimm) const;
+ TargetStackID::Value StackID,
+ Register &FrameReg, bool PreferFP,
+ bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
@@ -124,6 +130,7 @@ public:
return false;
case TargetStackID::Default:
case TargetStackID::ScalableVector:
+ case TargetStackID::ScalablePredicateVector:
case TargetStackID::NoAlloc:
return true;
}
@@ -132,7 +139,8 @@ public:
bool isStackIdSafeForLocalArea(unsigned StackId) const override {
// We don't support putting SVE objects into the pre-allocated local
// frame block at the moment.
- return StackId != TargetStackID::ScalableVector;
+ return (StackId != TargetStackID::ScalableVector &&
+ StackId != TargetStackID::ScalablePredicateVector);
}
void
@@ -145,7 +153,17 @@ public:
bool requiresSaveVG(const MachineFunction &MF) const;
- StackOffset getSVEStackSize(const MachineFunction &MF) const;
+ /// Returns the size of the entire ZPR stack frame (callee saves + spills).
+ StackOffset getZPRStackSize(const MachineFunction &MF) const;
+
+ /// Returns the size of the entire PPR stack frame (callee saves + spills +
+ /// hazard padding).
+ StackOffset getPPRStackSize(const MachineFunction &MF) const;
+
+ /// Returns the size of the entire SVE stack frame (PPRs + ZPRs).
+ StackOffset getSVEStackSize(const MachineFunction &MF) const {
+ return getZPRStackSize(MF) + getPPRStackSize(MF);
+ }
friend class AArch64PrologueEpilogueCommon;
friend class AArch64PrologueEmitter;
@@ -165,10 +183,6 @@ private:
/// Returns true if CSRs should be paired.
bool producePairRegisters(MachineFunction &MF) const;
- int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
- int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
- int &MinCSFrameIndex,
- int &MaxCSFrameIndex) const;
/// Make a determination whether a Hazard slot is used and create it if
/// needed.
void determineStackHazardSlot(MachineFunction &MF,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 177b4b0..e7b2d20 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -7497,7 +7497,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
int FI = cast<FrameIndexSDNode>(N)->getIndex();
// We can only encode VL scaled offsets, so only fold in frame indexes
// referencing SVE objects.
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ if (MFI.hasScalableStackID(FI)) {
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
return true;
@@ -7543,7 +7543,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
// We can only encode VL scaled offsets, so only fold in frame indexes
// referencing SVE objects.
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ if (MFI.hasScalableStackID(FI))
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1f4734..70d5ad7d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
setOperationAction(ISD::MULHU, VT, Custom);
@@ -6617,7 +6618,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
-
case Intrinsic::aarch64_neon_vsri:
case Intrinsic::aarch64_neon_vsli:
case Intrinsic::aarch64_sve_sri:
@@ -9256,8 +9256,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
(MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::SUBXri)) {
const MachineOperand &MO = MI.getOperand(1);
- if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
- TargetStackID::ScalableVector)
+ if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
/*IsImplicit=*/true));
}
@@ -9704,8 +9703,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
- if (isScalable)
- MFI.setStackID(FI, TargetStackID::ScalableVector);
+ if (isScalable) {
+ bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
+ VA.getValVT().getVectorElementType() == MVT::i1;
+ MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
+ : TargetStackID::ScalableVector);
+ }
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
SDValue Ptr = DAG.getFrameIndex(
@@ -15154,9 +15157,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
: Shift.getOperand(1);
unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
- SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
-
- return ResultSLI;
+ return DAG.getNode(Inst, DL, VT, X, Y, Imm);
}
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
@@ -29607,7 +29608,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
// than doing it here in finalizeLowering.
if (MFI.hasStackProtectorIndex()) {
for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
- if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
+ if (MFI.hasScalableStackID(i) &&
MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
MFI.setStackID(MFI.getStackProtectorIndex(),
TargetStackID::ScalableVector);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f07d351..6ef0a95 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -10176,28 +10176,6 @@ multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
(!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
}
-multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
- def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
- FPR8, FPR8, vecshiftR8, asm, []> {
- let Inst{18-16} = imm{2-0};
- }
-
- def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
- FPR16, FPR16, vecshiftR16, asm, []> {
- let Inst{19-16} = imm{3-0};
- }
-
- def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
- FPR32, FPR32, vecshiftR32, asm, []> {
- let Inst{20-16} = imm{4-0};
- }
-
- def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
- FPR64, FPR64, vecshiftR64, asm, []> {
- let Inst{21-16} = imm{5-0};
- }
-}
-
//----------------------------------------------------------------------------
// AdvSIMD vector x indexed element
//----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 35b27ea..5a90da1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5599,7 +5599,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_PXI;
- StackID = TargetStackID::ScalableVector;
+ StackID = TargetStackID::ScalablePredicateVector;
}
break;
}
@@ -5614,7 +5614,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
Opc = AArch64::STRSui;
else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
Opc = AArch64::STR_PPXI;
- StackID = TargetStackID::ScalableVector;
+ StackID = TargetStackID::ScalablePredicateVector;
}
break;
case 8:
@@ -5784,7 +5784,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
if (IsPNR)
PNRReg = DestReg;
Opc = AArch64::LDR_PXI;
- StackID = TargetStackID::ScalableVector;
+ StackID = TargetStackID::ScalablePredicateVector;
}
break;
}
@@ -5799,7 +5799,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
Opc = AArch64::LDRSui;
else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDR_PPXI;
- StackID = TargetStackID::ScalableVector;
+ StackID = TargetStackID::ScalablePredicateVector;
}
break;
case 8:
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index a81f5b3..b3c9656 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -23,12 +23,21 @@
using namespace llvm;
+static std::optional<uint64_t>
+getSVEStackSize(const AArch64FunctionInfo &MFI,
+ uint64_t (AArch64FunctionInfo::*GetStackSize)() const) {
+ if (!MFI.hasCalculatedStackSizeSVE())
+ return std::nullopt;
+ return (MFI.*GetStackSize)();
+}
+
yaml::AArch64FunctionInfo::AArch64FunctionInfo(
const llvm::AArch64FunctionInfo &MFI)
: HasRedZone(MFI.hasRedZone()),
- StackSizeSVE(MFI.hasCalculatedStackSizeSVE()
- ? std::optional<uint64_t>(MFI.getStackSizeSVE())
- : std::nullopt),
+ StackSizeZPR(
+ getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizeZPR)),
+ StackSizePPR(
+ getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)),
HasStackFrame(MFI.hasStackFrame()
? std::optional<bool>(MFI.hasStackFrame())
: std::nullopt) {}
@@ -41,8 +50,9 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
const yaml::AArch64FunctionInfo &YamlMFI) {
if (YamlMFI.HasRedZone)
HasRedZone = YamlMFI.HasRedZone;
- if (YamlMFI.StackSizeSVE)
- setStackSizeSVE(*YamlMFI.StackSizeSVE);
+ if (YamlMFI.StackSizeZPR || YamlMFI.StackSizePPR)
+ setStackSizeSVE(YamlMFI.StackSizeZPR.value_or(0),
+ YamlMFI.StackSizePPR.value_or(0));
if (YamlMFI.HasStackFrame)
setHasStackFrame(*YamlMFI.HasStackFrame);
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 897c7e8..91e64e6 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -74,13 +74,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// Amount of stack frame size, not including callee-saved registers.
uint64_t LocalStackSize = 0;
- /// The start and end frame indices for the SVE callee saves.
- int MinSVECSFrameIndex = 0;
- int MaxSVECSFrameIndex = 0;
-
/// Amount of stack frame size used for saving callee-saved registers.
unsigned CalleeSavedStackSize = 0;
- unsigned SVECalleeSavedStackSize = 0;
+ unsigned ZPRCalleeSavedStackSize = 0;
+ unsigned PPRCalleeSavedStackSize = 0;
bool HasCalleeSavedStackSize = false;
bool HasSVECalleeSavedStackSize = false;
@@ -137,9 +134,14 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// SVE stack sizes (for predicates and data vectors) are maintained here
/// rather than in FrameInfo, as the placement and Stack IDs are target
/// specific.
- uint64_t StackSizeSVE = 0;
+ uint64_t StackSizeZPR = 0;
+ uint64_t StackSizePPR = 0;
+
+ /// True if SVE objects (vectors and predicates) are split into separate
+ /// regions on the stack.
+ bool SplitSVEObjects = false;
- /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid.
+ /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid.
bool HasCalculatedStackSizeSVE = false;
/// Has a value when it is known whether or not the function uses a
@@ -312,16 +314,25 @@ public:
TailCallReservedStack = bytes;
}
- bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
-
- void setStackSizeSVE(uint64_t S) {
+ void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) {
+ StackSizeZPR = ZPR;
+ StackSizePPR = PPR;
HasCalculatedStackSizeSVE = true;
- StackSizeSVE = S;
}
- uint64_t getStackSizeSVE() const {
+ uint64_t getStackSizeZPR() const {
+ assert(hasCalculatedStackSizeSVE());
+ return StackSizeZPR;
+ }
+ uint64_t getStackSizePPR() const {
assert(hasCalculatedStackSizeSVE());
- return StackSizeSVE;
+ return StackSizePPR;
+ }
+
+ bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+
+ bool hasSVEStackSize() const {
+ return getStackSizeZPR() > 0 || getStackSizePPR() > 0;
}
bool hasStackFrame() const { return HasStackFrame; }
@@ -329,7 +340,6 @@ public:
bool isStackRealigned() const { return StackRealigned; }
void setStackRealigned(bool s) { StackRealigned = s; }
-
bool hasCalleeSaveStackFreeSpace() const {
return CalleeSaveStackHasFreeSpace;
}
@@ -414,29 +424,37 @@ public:
}
// Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
- void setSVECalleeSavedStackSize(unsigned Size) {
- SVECalleeSavedStackSize = Size;
+ void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) {
+ ZPRCalleeSavedStackSize = ZPR;
+ PPRCalleeSavedStackSize = PPR;
HasSVECalleeSavedStackSize = true;
}
- unsigned getSVECalleeSavedStackSize() const {
+ unsigned getZPRCalleeSavedStackSize() const {
assert(HasSVECalleeSavedStackSize &&
- "SVECalleeSavedStackSize has not been calculated");
- return SVECalleeSavedStackSize;
+ "ZPRCalleeSavedStackSize has not been calculated");
+ return ZPRCalleeSavedStackSize;
}
-
- void setMinMaxSVECSFrameIndex(int Min, int Max) {
- MinSVECSFrameIndex = Min;
- MaxSVECSFrameIndex = Max;
+ unsigned getPPRCalleeSavedStackSize() const {
+ assert(HasSVECalleeSavedStackSize &&
+ "PPRCalleeSavedStackSize has not been calculated");
+ return PPRCalleeSavedStackSize;
}
- int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; }
- int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; }
+ unsigned getSVECalleeSavedStackSize() const {
+ assert(!hasSplitSVEObjects() &&
+ "ZPRs and PPRs are split. Use get[ZPR|PPR]CalleeSavedStackSize()");
+ return getZPRCalleeSavedStackSize() + getPPRCalleeSavedStackSize();
+ }
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
unsigned getNumLocalDynamicTLSAccesses() const {
return NumLocalDynamicTLSAccesses;
}
+ bool isStackHazardIncludedInCalleeSaveArea() const {
+ return hasStackHazardSlotIndex() && !hasSplitSVEObjects();
+ }
+
std::optional<bool> hasRedZone() const { return HasRedZone; }
void setHasRedZone(bool s) { HasRedZone = s; }
@@ -472,6 +490,15 @@ public:
StackHazardCSRSlotIndex = Index;
}
+ bool hasSplitSVEObjects() const { return SplitSVEObjects; }
+ void setSplitSVEObjects(bool s) { SplitSVEObjects = s; }
+
+ bool hasSVE_AAPCS(const MachineFunction &MF) const {
+ return hasSplitSVEObjects() || isSVECC() ||
+ MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SVE_VectorCall;
+ }
+
SMEAttrs getSMEFnAttrs() const { return SMEFnAttrs; }
unsigned getSRetReturnReg() const { return SRetReturnReg; }
@@ -611,7 +638,8 @@ private:
namespace yaml {
struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
std::optional<bool> HasRedZone;
- std::optional<uint64_t> StackSizeSVE;
+ std::optional<uint64_t> StackSizeZPR;
+ std::optional<uint64_t> StackSizePPR;
std::optional<bool> HasStackFrame;
AArch64FunctionInfo() = default;
@@ -624,7 +652,8 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
template <> struct MappingTraits<AArch64FunctionInfo> {
static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) {
YamlIO.mapOptional("hasRedZone", MFI.HasRedZone);
- YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE);
+ YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR);
+ YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR);
YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame);
}
};
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 09b3643..aed137c 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -48,21 +48,19 @@ bool AArch64PrologueEpilogueCommon::isVGInstruction(
return Opc == TargetOpcode::COPY;
}
-// Convenience function to determine whether I is an SVE callee save.
-static bool isSVECalleeSave(MachineBasicBlock::iterator I) {
+// Convenience function to determine whether I is part of the ZPR callee saves.
+static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
- case AArch64::PTRUE_C_B:
case AArch64::LD1B_2Z_IMM:
case AArch64::ST1B_2Z_IMM:
case AArch64::STR_ZXI:
- case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
- case AArch64::LDR_PXI:
- case AArch64::PTRUE_B:
case AArch64::CPY_ZPzI_B:
case AArch64::CMPNE_PPzZI_B:
+ case AArch64::PTRUE_C_B:
+ case AArch64::PTRUE_B:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
case AArch64::SEH_SavePReg:
@@ -71,6 +69,23 @@ static bool isSVECalleeSave(MachineBasicBlock::iterator I) {
}
}
+// Convenience function to determine whether I is part of the PPR callee saves.
+static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) {
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case AArch64::STR_PXI:
+ case AArch64::LDR_PXI:
+ return I->getFlag(MachineInstr::FrameSetup) ||
+ I->getFlag(MachineInstr::FrameDestroy);
+ }
+}
+
+// Convenience function to determine whether I is part of the SVE callee saves.
+static bool isPartOfSVECalleeSaves(MachineBasicBlock::iterator I) {
+ return isPartOfZPRCalleeSaves(I) || isPartOfPPRCalleeSaves(I);
+}
+
AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon(
MachineFunction &MF, MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL)
@@ -316,7 +331,7 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump(
// When there is an SVE area on the stack, always allocate the
// callee-saves and spills/locals separately.
- if (AFL.getSVEStackSize(MF))
+ if (AFI->hasSVEStackSize())
return false;
return true;
@@ -639,7 +654,7 @@ void AArch64PrologueEmitter::emitPrologue() {
// Now allocate space for the GPR callee saves.
MachineBasicBlock::iterator MBBI = PrologueBeginI;
- while (MBBI != EndI && isSVECalleeSave(MBBI))
+ while (MBBI != EndI && isPartOfSVECalleeSaves(MBBI))
++MBBI;
FirstGPRSaveI = convertCalleeSaveRestoreToSPPrePostIncDec(
MBBI, DL, -AFI->getCalleeSavedStackSize(), EmitAsyncCFI);
@@ -669,7 +684,7 @@ void AArch64PrologueEmitter::emitPrologue() {
MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI;
while (AfterGPRSavesI != EndI &&
AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) &&
- !isSVECalleeSave(AfterGPRSavesI)) {
+ !isPartOfSVECalleeSaves(AfterGPRSavesI)) {
if (CombineSPBump &&
// Only fix-up frame-setup load/store instructions.
(!AFL.requiresSaveVG(MF) || !isVGInstruction(AfterGPRSavesI, TLI)))
@@ -700,56 +715,105 @@ void AArch64PrologueEmitter::emitPrologue() {
if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
- StackOffset SVEStackSize = AFL.getSVEStackSize(MF);
- StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
- MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI;
+ StackOffset PPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
+ StackOffset ZPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
+ StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize;
+ StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
+ StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
+
+ std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin,
+ ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd;
StackOffset CFAOffset =
StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-
- // Process the SVE callee-saves to determine what space needs to be
- // allocated.
MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
- if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
- LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
- << "\n");
- SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
- SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
- // Find callee save instructions in frame.
- // Note: With FPAfterSVECalleeSaves the callee saves have already been
- // allocated.
- if (!FPAfterSVECalleeSaves) {
- MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI;
- assert(isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
- while (isSVECalleeSave(AfterSVESavesI) &&
+ if (!FPAfterSVECalleeSaves) {
+ // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR
+ // areas.
+ PPRCalleeSavesBegin = AfterGPRSavesI;
+ if (PPRCalleeSavesSize) {
+ LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = "
+ << PPRCalleeSavesSize.getScalable() << "\n");
+
+ assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) &&
+ "Unexpected instruction");
+ while (isPartOfPPRCalleeSaves(AfterSVESavesI) &&
+ AfterSVESavesI != MBB.getFirstTerminator())
+ ++AfterSVESavesI;
+ }
+ PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI;
+ if (ZPRCalleeSavesSize) {
+ LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = "
+ << ZPRCalleeSavesSize.getScalable() << "\n");
+ assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) &&
+ "Unexpected instruction");
+ while (isPartOfZPRCalleeSaves(AfterSVESavesI) &&
AfterSVESavesI != MBB.getFirstTerminator())
++AfterSVESavesI;
- CalleeSavesEnd = AfterSVESavesI;
-
- StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
- // Allocate space for the callee saves (if any).
- allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || LocalsSize);
}
+ ZPRCalleeSavesEnd = AfterSVESavesI;
}
- CFAOffset += SVECalleeSavesSize;
if (EmitAsyncCFI)
- emitCalleeSavedSVELocations(CalleeSavesEnd);
-
- // Allocate space for the rest of the frame including SVE locals. Align the
- // stack as necessary.
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
- "Cannot use redzone with stack realignment");
- if (!AFL.canUseRedZone(MF)) {
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- allocateStackSpace(CalleeSavesEnd, RealignmentPadding,
- SVELocalsSize + StackOffset::getFixed(NumBytes),
+ emitCalleeSavedSVELocations(AfterSVESavesI);
+
+ if (AFI->hasSplitSVEObjects()) {
+ assert(!FPAfterSVECalleeSaves &&
+ "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
+ assert(!AFL.canUseRedZone(MF) &&
+ "Cannot use redzone with aarch64-split-sve-objects");
+ // TODO: Handle HasWinCFI/NeedsWinCFI?
+ assert(!NeedsWinCFI &&
+ "WinCFI with aarch64-split-sve-objects is not supported");
+
+ // Split ZPR and PPR allocation.
+ // Allocate PPR callee saves
+ allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
+ EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
+ ZPRLocalsSize || PPRLocalsSize);
+ CFAOffset += PPRCalleeSavesSize;
+
+ // Allocate PPR locals + ZPR callee saves
+ assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
+ "Expected ZPR callee saves after PPR locals");
+ allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
+ PPRLocalsSize + ZPRCalleeSavesSize,
+ EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects() || ZPRLocalsSize);
+ CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
+
+ // Allocate ZPR locals
+ allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
+ ZPRLocalsSize + StackOffset::getFixed(NumBytes),
EmitAsyncCFI && !HasFP, CFAOffset,
MFI.hasVarSizedObjects());
+ } else {
+ // Allocate space for the callee saves (if any).
+ StackOffset LocalsSize =
+ PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes);
+ if (!FPAfterSVECalleeSaves)
+ allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize,
+ EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects() || LocalsSize);
+ CFAOffset += SVECalleeSavesSize;
+
+ // Allocate space for the rest of the frame including SVE locals. Align the
+ // stack as necessary.
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+ "Cannot use redzone with stack realignment");
+ if (!AFL.canUseRedZone(MF)) {
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize;
+ allocateStackSpace(AfterSVESavesI, RealignmentPadding,
+ SVELocalsSize + StackOffset::getFixed(NumBytes),
+ EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects());
+ }
}
// If we need a base pointer, set it up here. It's whatever the value of the
@@ -796,7 +860,8 @@ void AArch64PrologueEmitter::emitPrologue() {
emitDefineCFAWithFP(AfterSVESavesI, FixedObject);
} else {
StackOffset TotalSize =
- SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
+ AFL.getSVEStackSize(MF) +
+ StackOffset::getFixed((int64_t)MFI.getStackSize());
CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup);
CFIBuilder.insertCFIInst(
createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
@@ -1165,7 +1230,7 @@ void AArch64PrologueEmitter::emitCalleeSavedGPRLocations(
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
for (const auto &Info : CSI) {
unsigned FrameIdx = Info.getFrameIdx();
- if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
+ if (MFI.hasScalableStackID(FrameIdx))
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
@@ -1191,8 +1256,10 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
AFL.getOffsetOfLocalArea();
}
+ StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
+ int FI = Info.getFrameIdx();
+ if (!MFI.hasScalableStackID(FI))
continue;
// Not all unwinders may know about SVE registers, so assume the lowest
@@ -1203,9 +1270,13 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
continue;
StackOffset Offset =
- StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
+ StackOffset::getScalable(MFI.getObjectOffset(FI)) -
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
+ if (AFI->hasSplitSVEObjects() &&
+ MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ Offset -= PPRStackSize;
+
CFIBuilder.insertCFIInst(
createCFAOffset(RegInfo, Reg, Offset, IncomingVGOffsetFromDefCFA));
}
@@ -1322,7 +1393,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
while (FirstGPRRestoreI != Begin) {
--FirstGPRRestoreI;
if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
- (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) {
+ (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
++FirstGPRRestoreI;
break;
} else if (CombineSPBump)
@@ -1346,7 +1417,9 @@ void AArch64EpilogueEmitter::emitEpilogue() {
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
- const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF);
+ StackOffset ZPRStackSize = AFL.getZPRStackSize(MF);
+ StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
+ StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
@@ -1367,106 +1440,188 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- // Process the SVE callee-saves to determine what space needs to be
- // deallocated.
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
- MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
- RestoreEnd = FirstGPRRestoreI;
- if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
- if (FPAfterSVECalleeSaves)
- RestoreEnd = MBB.getFirstTerminator();
-
- RestoreBegin = std::prev(RestoreEnd);
- while (RestoreBegin != MBB.begin() &&
- isSVECalleeSave(std::prev(RestoreBegin)))
- --RestoreBegin;
-
- assert(isSVECalleeSave(RestoreBegin) &&
- isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
-
- StackOffset CalleeSavedSizeAsOffset =
- StackOffset::getScalable(CalleeSavedSize);
- DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
- DeallocateAfter = CalleeSavedSizeAsOffset;
- }
-
- // Deallocate the SVE area.
- if (FPAfterSVECalleeSaves) {
- // If the callee-save area is before FP, restoring the FP implicitly
- // deallocates non-callee-save SVE allocations. Otherwise, deallocate
- // them explicitly.
- if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
- emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
+ if (!AFI->hasSplitSVEObjects()) {
+ // Process the SVE callee-saves to determine what space needs to be
+ // deallocated.
+ StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
+ MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
+ RestoreEnd = FirstGPRRestoreI;
+ int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize();
+ int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize();
+ int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize;
+
+ if (SVECalleeSavedSize) {
+ if (FPAfterSVECalleeSaves)
+ RestoreEnd = MBB.getFirstTerminator();
+
+ RestoreBegin = std::prev(RestoreEnd);
+ while (RestoreBegin != MBB.begin() &&
+ isPartOfSVECalleeSaves(std::prev(RestoreBegin)))
+ --RestoreBegin;
+
+ assert(isPartOfSVECalleeSaves(RestoreBegin) &&
+ isPartOfSVECalleeSaves(std::prev(RestoreEnd)) &&
+ "Unexpected instruction");
+
+ StackOffset CalleeSavedSizeAsOffset =
+ StackOffset::getScalable(SVECalleeSavedSize);
+ DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
+ DeallocateAfter = CalleeSavedSizeAsOffset;
}
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate callee-save SVE registers.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
- } else if (SVEStackSize) {
- int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
- // If we have stack realignment or variable-sized objects we must use the
- // FP to restore SVE callee saves (as there is an unknown amount of
- // data/padding between the SP and SVE CS area).
- Register BaseForSVEDealloc =
- (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
- : AArch64::SP;
- if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need to
- // compute the base address by subtracting the offest in a temporary
- // register first (to avoid briefly deallocating the SVE CS).
- CalleeSaveBase =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
- MachineInstr::FrameDestroy);
- }
- // The code below will deallocate the stack space space by moving the
- // SP to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- StackOffset::getScalable(-SVECalleeSavedSize), TII,
- MachineInstr::FrameDestroy);
- } else if (BaseForSVEDealloc == AArch64::SP) {
- if (SVECalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- emitFrameOffset(
- MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
- NumBytes = 0;
+ // Deallocate the SVE area.
+ if (FPAfterSVECalleeSaves) {
+ // If the callee-save area is before FP, restoring the FP implicitly
+ // deallocates non-callee-save SVE allocations. Otherwise, deallocate
+ // them explicitly.
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
+ DeallocateBefore, TII, MachineInstr::FrameDestroy,
+ false, NeedsWinCFI, &HasWinCFI);
}
+ // Deallocate callee-save non-SVE registers.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ StackOffset::getFixed(AFI->getCalleeSavedStackSize()),
+ TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
+
+ // Deallocate fixed objects.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(FixedObject), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
+ // Deallocate callee-save SVE registers.
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- DeallocateAfter +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ NeedsWinCFI, &HasWinCFI);
+ } else if (SVEStackSize) {
+ int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
+ // If we have stack realignment or variable-sized objects we must use the
+ // FP to restore SVE callee saves (as there is an unknown amount of
+ // data/padding between the SP and SVE CS area).
+ Register BaseForSVEDealloc =
+ (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
+ : AArch64::SP;
+ if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
+ Register CalleeSaveBase = AArch64::FP;
+ if (int64_t CalleeSaveBaseOffset =
+ AFI->getCalleeSaveBaseToFrameRecordOffset()) {
+        // If we have a non-zero offset to the non-SVE CS base, we need to
+        // compute the base address by subtracting the offset in a temporary
+        // register first (to avoid briefly deallocating the SVE CS).
+ CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
+ StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
+ MachineInstr::FrameDestroy);
+ }
+      // The code below will deallocate the stack space by moving the SP to
+      // the start of the SVE callee-save area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
+ StackOffset::getScalable(-SVECalleeSavedSize), TII,
+ MachineInstr::FrameDestroy);
+ } else if (BaseForSVEDealloc == AArch64::SP) {
+ if (SVECalleeSavedSize) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(NumBytes), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP,
+ SVEStackSize + StackOffset::getFixed(
+ NumBytes + PrologueSaveSize));
+ NumBytes = 0;
+ }
+
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ DeallocateBefore, TII, MachineInstr::FrameDestroy,
+ false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
+ SVEStackSize +
+ StackOffset::getFixed(NumBytes + PrologueSaveSize));
+
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
+ DeallocateAfter +
+ StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ }
+
+ if (EmitCFI)
+ emitCalleeSavedSVERestores(RestoreEnd);
+ }
+ } else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
+    // TODO: Support stack realignment and variable-sized objects.
+ assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
+ "unexpected stack realignment or variable sized objects with split "
+ "SVE stack objects");
+    // With SplitSVEObjects, determine the sizes and starts/ends of the ZPR
+    // and PPR areas.
+ auto ZPRCalleeSavedSize =
+ StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
+ auto PPRCalleeSavedSize =
+ StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
+ StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
+ StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
+
+ MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
+ PPRRestoreEnd = FirstGPRRestoreI;
+ if (PPRCalleeSavedSize) {
+ PPRRestoreBegin = std::prev(PPRRestoreEnd);
+ while (PPRRestoreBegin != MBB.begin() &&
+ isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
+ --PPRRestoreBegin;
+ }
+
+ MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
+ ZPRRestoreEnd = PPRRestoreBegin;
+ if (ZPRCalleeSavedSize) {
+ ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
+ while (ZPRRestoreBegin != MBB.begin() &&
+ isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
+ --ZPRRestoreBegin;
}
+
+ auto CFAOffset =
+ SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
+ if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ auto NonSVELocals = StackOffset::getFixed(NumBytes);
+ emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
+ NonSVELocals, TII, MachineInstr::FrameDestroy, false,
+ false, nullptr, EmitCFI && !HasFP, CFAOffset);
+ NumBytes = 0;
+ CFAOffset -= NonSVELocals;
+ }
+
+ if (ZPRLocalsSize) {
+ emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
+ ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
+ false, nullptr, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= ZPRLocalsSize;
+ }
+
+ if (PPRLocalsSize || ZPRCalleeSavedSize) {
+ assert(PPRRestoreBegin == ZPRRestoreEnd &&
+ "Expected PPR restores after ZPR");
+ emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
+ PPRLocalsSize + ZPRCalleeSavedSize, TII,
+ MachineInstr::FrameDestroy, false, false, nullptr,
+ EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
+ }
+ if (PPRCalleeSavedSize) {
+ emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
+ PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
+ false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
+ }
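+    // Sketch of the net effect of the deallocations above (assuming, per the
+    // assert at the top of this branch, no stack realignment or
+    // variable-sized objects): SP moves up past the non-SVE locals, then the
+    // ZPR locals, then the PPR locals plus the ZPR callee-save area, and
+    // finally the PPR callee-save area.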
+
+ // We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
if (EmitCFI)
- emitCalleeSavedSVERestores(RestoreEnd);
+ emitCalleeSavedSVERestores(ZPRRestoreEnd);
}
if (!HasFP) {
@@ -1624,8 +1779,7 @@ void AArch64EpilogueEmitter::emitCalleeSavedRestores(
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy);
for (const auto &Info : CSI) {
- if (SVE !=
- (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
+ if (SVE != MFI.hasScalableStackID(Info.getFrameIdx()))
continue;
MCRegister Reg = Info.getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 2b0c8ad..79975b0 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -71,6 +71,7 @@ bool AArch64RegisterInfo::regNeedsCFI(MCRegister Reg,
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
+ auto &AFI = *MF->getInfo<AArch64FunctionInfo>();
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
// GHC set of callee saved regs is empty as all those regs are
@@ -101,10 +102,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_Win_AArch64_AAPCS_SwiftTail_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_Win_AArch64_AAVPCS_SaveList;
- if (MF->getFunction().getCallingConv() ==
- CallingConv::AArch64_SVE_VectorCall)
- return CSR_Win_AArch64_SVE_AAPCS_SaveList;
- if (MF->getInfo<AArch64FunctionInfo>()->isSVECC())
+ if (AFI.hasSVE_AAPCS(*MF))
return CSR_Win_AArch64_SVE_AAPCS_SaveList;
return CSR_Win_AArch64_AAPCS_SaveList;
}
@@ -148,7 +146,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
// This is for OSes other than Windows; Windows is a separate case further
// above.
return CSR_AArch64_AAPCS_X18_SaveList;
- if (MF->getInfo<AArch64FunctionInfo>()->isSVECC())
+ if (AFI.hasSVE_AAPCS(*MF))
return CSR_AArch64_SVE_AAPCS_SaveList;
return CSR_AArch64_AAPCS_SaveList;
}
@@ -158,6 +156,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
"Invalid subtarget for getDarwinCalleeSavedRegs");
+ auto &AFI = *MF->getInfo<AArch64FunctionInfo>();
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
report_fatal_error(
@@ -205,7 +204,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_Darwin_AArch64_RT_AllRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::Win64)
return CSR_Darwin_AArch64_AAPCS_Win64_SaveList;
- if (MF->getInfo<AArch64FunctionInfo>()->isSVECC())
+ if (AFI.hasSVE_AAPCS(*MF))
return CSR_Darwin_AArch64_SVE_AAPCS_SaveList;
return CSR_Darwin_AArch64_AAPCS_SaveList;
}
@@ -643,7 +642,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
if (ST.hasSVE() || ST.isStreaming()) {
// Frames that have variable sized objects and scalable SVE objects,
// should always use a basepointer.
- if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
+ if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize())
return true;
}
@@ -783,7 +782,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() ||
AFI->hasCalculatedStackSizeSVE()) &&
"Expected SVE area to be calculated by this point");
- return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() &&
+ return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() &&
!AFI->hasStackHazardSlotIndex();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6..6efa78e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -608,6 +608,8 @@ public:
? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
: EmptySet;
+ const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size();
+
for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
// Each iteration of this loop assigns exactly one global variable to
// exactly one of the implementation strategies.
@@ -647,7 +649,8 @@ public:
ModuleScopeVariables.insert(GV);
} else if (K.second.size() == 1) {
KernelAccessVariables.insert(GV);
- } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+ } else if (K.second.size() == HybridModuleRootKernelsSize &&
+ set_is_subset(K.second, HybridModuleRootKernels)) {
ModuleScopeVariables.insert(GV);
} else {
TableLookupVariables.insert(GV);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7c5d4fc..e4b3528 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::SGPRSpill:
return true;
case TargetStackID::ScalableVector:
+ case TargetStackID::ScalablePredicateVector:
case TargetStackID::WasmLocal:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 8f1dd62..5630580 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
let HasSGPR = 1;
let Size = 64;
}
+
+def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128, SReg_128)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 128;
+}
+
+def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128_Align2, SReg_128)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 128;
+}
} // End GeneratePressureSet = 0
// Define a register tuple class, along with one requiring an even
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index b3fd8c7..84287b6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -352,10 +352,12 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+let Defs = [SCC] in {
def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32",
[(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>;
def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64",
[(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>;
+}
let Uses = [M0] in {
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index fa130a1..26ff54c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -775,6 +775,16 @@ class VectorType;
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
+ /// Return true if it is profitable to fold a pair of shifts into a mask.
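+  /// For example (illustrative): scalar i8/i16/i32 values qualify below,
+  /// while i64 and any vector type do not.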
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
+ return VT.getScalarSizeInBits() <= 32;
+ }
+
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
unsigned SelectOpcode, SDValue X,
SDValue Y) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 45d194e..939841a 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2804,6 +2804,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vL32b_nt_cur_npred_ai:
case Hexagon::V6_vL32b_nt_tmp_pred_ai:
case Hexagon::V6_vL32b_nt_tmp_npred_ai:
+ case Hexagon::V6_vS32Ub_npred_ai:
case Hexagon::V6_vgathermh_pseudo:
case Hexagon::V6_vgathermw_pseudo:
case Hexagon::V6_vgathermhw_pseudo:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3ac7c28..8c21746 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -638,6 +638,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// No support for these operations with v2f32/v2i32
setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v2i32, Expand);
+
// Need custom lowering in case the index is dynamic.
if (STI.hasF32x2Instructions())
setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 06ce917..7d4535a 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -2395,6 +2395,7 @@ bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::NoAlloc:
case TargetStackID::SGPRSpill:
case TargetStackID::WasmLocal:
+ case TargetStackID::ScalablePredicateVector:
return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index 19d5aff..cf6f83a 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -110,16 +110,16 @@ def : StPat<truncstorei8, SB, GPR, i16>;
let Predicates = [HasAtomicLdSt] in {
// Prefer unsigned due to no c.lb in Zcb.
- def : LdPat<atomic_load_aext_8, LBU, i16>;
- def : LdPat<atomic_load_nonext_16, LH, i16>;
+ def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>;
+ def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>;
- def : StPat<atomic_store_8, SB, GPR, i16>;
- def : StPat<atomic_store_16, SH, GPR, i16>;
+ def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>;
+ def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>;
}
let Predicates = [HasAtomicLdSt, IsRV64] in {
- def : LdPat<atomic_load_nonext_32, LW, i32>;
- def : StPat<atomic_store_32, SW, GPR, i32>;
+ // Load pattern is in RISCVInstrInfoA.td and shared with RV32.
+ def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 99992d1..25accd9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -174,15 +174,14 @@ let Predicates = [HasAtomicLdSt] in {
def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>;
def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>;
def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>;
-}
-let Predicates = [HasAtomicLdSt, IsRV32] in {
- def : LdPat<relaxed_load<atomic_load_nonext_32>, LW>;
+ // Used by GISel for RV32 and RV64.
+ def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>;
}
let Predicates = [HasAtomicLdSt, IsRV64] in {
- def : LdPat<relaxed_load<atomic_load_asext_32>, LW>;
- def : LdPat<relaxed_load<atomic_load_zext_32>, LWU>;
+ def : LdPat<relaxed_load<atomic_load_asext_32>, LW, i64>;
+ def : LdPat<relaxed_load<atomic_load_zext_32>, LWU, i64>;
def : LdPat<relaxed_load<atomic_load_nonext_64>, LD, i64>;
def : StPat<relaxed_store<atomic_store_64>, SD, GPR, i64>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index d998316..298d35a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -554,7 +554,8 @@ defset list<VTypeInfoToWide> AllWidenableBF16ToFloatVectors = {
// This represents the information we need in codegen for each pseudo.
// The definition should be consistent with `struct PseudoInfo` in
// RISCVInstrInfo.h.
-class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [], string opcodestr = "", string argstr = "">
+class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [],
+ string opcodestr = "", string argstr = "">
: Pseudo<outs, ins, pattern, opcodestr, argstr> {
Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key.
Instruction BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
@@ -1010,8 +1011,7 @@ class VPseudoNullaryNoMask<VReg RegClass> :
class VPseudoNullaryMask<VReg RegClass> :
RISCVVPseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
(ins GetVRegNoV0<RegClass>.R:$passthru,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1190,8 +1190,7 @@ class VPseudoBinaryNoMask<VReg RetClass,
bits<2> TargetConstraintType = 1,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs RetClass:$rd),
- (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew),
- []> {
+ (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1227,8 +1226,7 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1320,7 +1318,7 @@ class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- AVL:$vl, sew:$sew),[]>,
+ AVL:$vl, sew:$sew)>,
RISCVVSX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1333,7 +1331,7 @@ class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1351,8 +1349,7 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1371,8 +1368,7 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1414,8 +1410,7 @@ class VPseudoBinaryMOutMask<VReg RetClass,
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1438,8 +1433,7 @@ class VPseudoTiedBinaryMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1546,8 +1540,7 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
- []> {
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1716,8 +1709,8 @@ class VPseudoUSSegStoreNoMask<VReg ValClass,
int EEW,
bits<4> NF> :
RISCVVPseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew),
- []>,
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -6029,9 +6022,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1 in {
PseudoInstExpansion<(CSRRS GPR:$rd, SysRegVLENB.Encoding, X0)>,
Sched<[WriteRdVLENB]>;
let Defs = [VL, VTYPE] in {
- def PseudoReadVLENBViaVSETVLIX0 : Pseudo<(outs GPRNoX0:$rd), (ins uimm5:$shamt),
- []>,
- Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoReadVLENBViaVSETVLIX0 : Pseudo<(outs GPRNoX0:$rd),
+ (ins uimm5:$shamt), []>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
}
}
@@ -6694,14 +6687,14 @@ defm PseudoVID : VPseudoVID_V;
let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
- def PseudoVMV_X_S:
+ def PseudoVMV_X_S :
RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>,
Sched<[WriteVMovXS, ReadVMovXS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
- def PseudoVMV_S_X: RISCVVPseudo<(outs VR:$rd),
- (ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew),
- []>,
+ def PseudoVMV_S_X :
+ RISCVVPseudo<(outs VR:$rd),
+ (ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew)>,
Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>;
}
} // Predicates = [HasVInstructions]
@@ -6721,8 +6714,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
Constraints = "$rd = $passthru" in
def "PseudoVFMV_S_" # f.FX :
RISCVVPseudo<(outs VR:$rd),
- (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew),
- []>,
+ (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew)>,
Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
index 5e013b4..1674c95 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td
@@ -63,13 +63,14 @@ defm SD : SRL_r_aq_rl<0b011, "sd">;
//===----------------------------------------------------------------------===//
class PatLAQ<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
- : Pat<(vt (OpNode (vt GPRMemZeroOffset:$rs1))), (Inst GPRMemZeroOffset:$rs1)>;
+ : Pat<(vt (OpNode (XLenVT GPRMemZeroOffset:$rs1))),
+ (Inst GPRMemZeroOffset:$rs1)>;
// n.b. this switches order of arguments
// to deal with the fact that SRL has addr, data
// while atomic_store has data, addr
class PatSRL<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT>
- : Pat<(OpNode (vt GPR:$rs2), (vt GPRMemZeroOffset:$rs1)),
+ : Pat<(OpNode (vt GPR:$rs2), (XLenVT GPRMemZeroOffset:$rs1)),
(Inst GPRMemZeroOffset:$rs1, GPR:$rs2)>;
@@ -97,16 +98,15 @@ let Predicates = [HasStdExtZalasr] in {
let Predicates = [HasStdExtZalasr, IsRV32] in {
def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ>;
def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ>;
-
-} // Predicates = [HasStdExtZalasr, IsRV64]
+} // Predicates = [HasStdExtZalasr, IsRV32]
let Predicates = [HasStdExtZalasr, IsRV64] in {
- def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ>;
- def : PatLAQ<seq_cst_load<atomic_load_asext_32>, LW_AQ>;
+ def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ, i64>;
+ def : PatLAQ<seq_cst_load<atomic_load_asext_32>, LW_AQ, i64>;
- def : PatLAQ<acquiring_load<atomic_load_nonext_64>, LD_AQ>;
- def : PatLAQ<seq_cst_load<atomic_load_nonext_64>, LD_AQ>;
+ def : PatLAQ<acquiring_load<atomic_load_nonext_64>, LD_AQ, i64>;
+ def : PatLAQ<seq_cst_load<atomic_load_nonext_64>, LD_AQ, i64>;
- def : PatSRL<releasing_store<atomic_store_64>, SD_RL>;
- def : PatSRL<seq_cst_store<atomic_store_64>, SD_RL>;
+ def : PatSRL<releasing_store<atomic_store_64>, SD_RL, i64>;
+ def : PatSRL<seq_cst_store<atomic_store_64>, SD_RL, i64>;
} // Predicates = [HasStdExtZalasr, IsRV64]
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
index aea3397..205895e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -39,6 +39,7 @@ private:
void collectBindingInfo(Module &M);
uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet);
void replaceImplicitBindingCalls(Module &M);
+ void verifyUniqueOrderIdPerResource(SmallVectorImpl<CallInst *> &Calls);
// A map from descriptor set to a bit vector of used binding numbers.
std::vector<BitVector> UsedBindings;
@@ -94,6 +95,33 @@ void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) {
});
}
+void SPIRVLegalizeImplicitBinding::verifyUniqueOrderIdPerResource(
+ SmallVectorImpl<CallInst *> &Calls) {
+ // Check that the order Id is unique per resource.
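+  // For example (hypothetical operand values): two implicit-binding calls
+  // that both use order ID 3 must also use the same descriptor set; order ID
+  // 3 paired with descriptor sets 0 and 1 reaches the fatal error below.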
+ for (uint32_t i = 1; i < Calls.size(); ++i) {
+ const uint32_t OrderIdArgIdx = 0;
+ const uint32_t DescSetArgIdx = 1;
+ const uint32_t OrderA =
+ cast<ConstantInt>(Calls[i - 1]->getArgOperand(OrderIdArgIdx))
+ ->getZExtValue();
+ const uint32_t OrderB =
+ cast<ConstantInt>(Calls[i]->getArgOperand(OrderIdArgIdx))
+ ->getZExtValue();
+ if (OrderA == OrderB) {
+ const uint32_t DescSetA =
+ cast<ConstantInt>(Calls[i - 1]->getArgOperand(DescSetArgIdx))
+ ->getZExtValue();
+ const uint32_t DescSetB =
+ cast<ConstantInt>(Calls[i]->getArgOperand(DescSetArgIdx))
+ ->getZExtValue();
+ if (DescSetA != DescSetB) {
+ report_fatal_error("Implicit binding calls with the same order ID must "
+ "have the same descriptor set");
+ }
+ }
+ }
+}
+
uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding(
uint32_t DescSet) {
if (UsedBindings.size() <= DescSet) {
@@ -112,11 +140,23 @@ uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding(
}
void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) {
+ uint32_t lastOrderId = -1;
+ uint32_t lastBindingNumber = -1;
+
for (CallInst *OldCI : ImplicitBindingCalls) {
IRBuilder<> Builder(OldCI);
+ const uint32_t OrderId =
+ cast<ConstantInt>(OldCI->getArgOperand(0))->getZExtValue();
const uint32_t DescSet =
cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue();
- const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet);
+
+ // Reuse an existing binding for this order ID, if one was already assigned.
+ // Otherwise, assign a new binding.
+ const uint32_t NewBinding = (lastOrderId == OrderId)
+ ? lastBindingNumber
+ : getAndReserveFirstUnusedBinding(DescSet);
+ lastOrderId = OrderId;
+ lastBindingNumber = NewBinding;
SmallVector<Value *, 8> Args;
Args.push_back(Builder.getInt32(DescSet));
@@ -142,6 +182,7 @@ bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) {
if (ImplicitBindingCalls.empty()) {
return false;
}
+ verifyUniqueOrderIdPerResource(ImplicitBindingCalls);
replaceImplicitBindingCalls(M);
return true;
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 2cfdc75..a068138 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -957,6 +957,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
EVT VT) const {
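+  // For example (illustrative type): a vector compare such as v256i32 now
+  // yields a v256i1 result, while scalar compares still produce i32.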
+ if (VT.isVector())
+ return VT.changeVectorElementType(MVT::i1);
return MVT::i32;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 34854e4..3802506 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45457,7 +45457,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
- if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
+ if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
+ SrcVT.getScalarType() != MVT::i1)
return SDValue();
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
@@ -52388,16 +52389,41 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub =
- DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ // If EFLAGS is from a CMP that compares the same operands as the earlier
+ // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
+ // SBB/ADC without creating a flipped SUB.
+ if (EFLAGS.getOpcode() == X86ISD::CMP &&
+ EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(0, DL, VT), NewEFLAGS);
+ DAG.getConstant(0, DL, VT), EFLAGS);
+ }
+
+ if (EFLAGS.getOpcode() == X86ISD::SUB &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Only create NewSub if we know one of the folds will succeed to avoid
+ // introducing a temporary node that may persist and affect one-use checks
+ // below.
+ if (EFLAGS.getNode()->hasOneUse()) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), NewEFLAGS);
+ }
+
+ if (IsSub && X == EFLAGS.getValue(0)) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
+ EFLAGS.getOperand(0), EFLAGS.getOperand(1),
+ NewEFLAGS);
+ }
}
}
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 278ae46..0ba71ad 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -854,6 +854,7 @@ public:
: Func(F), SC(ShapeC), DT(nullptr) {}
bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
+ bool combineTilezero(IntrinsicInst *Cast);
bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
bool combineAMXcast(TargetLibraryInfo *TLI);
bool transformAMXCast(IntrinsicInst *AMXCast);
@@ -1175,6 +1176,26 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
return EraseLoad;
}
+// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+// -->
+// %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
+bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
+ Value *Row = nullptr, *Col = nullptr;
+ Use &U = *(Cast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = cast<IntrinsicInst>(U.getUser());
+ if (!isAMXIntrinsic(II))
+ return false;
+
+ std::tie(Row, Col) = SC->getShape(II, OpNo);
+
+ IRBuilder<> Builder(Cast);
+ Value *NewInst =
+ Builder.CreateIntrinsic(Intrinsic::x86_tilezero_internal, {}, {Row, Col});
+ Cast->replaceAllUsesWith(NewInst);
+ return true;
+}
+
bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
bool Change = false;
for (auto *Cast : Casts) {
@@ -1198,6 +1219,14 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
for (auto *Store : DeadStores)
Store->eraseFromParent();
} else { // x86_cast_vector_to_tile
+ // %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+ // -->
+ // %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
+ if (isa<ConstantAggregateZero>(Cast->getOperand(0))) {
+ Change |= combineTilezero(cast<IntrinsicInst>(Cast));
+ continue;
+ }
+
auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
if (!Load || !Load->hasOneUse())
continue;
@@ -1210,6 +1239,7 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
// Set the operand is null so that load instruction can be erased.
Cast->setOperand(0, nullptr);
Load->eraseFromParent();
+ Change = true;
}
}
}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index c4f1b68..ddb95a4 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3981,7 +3981,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
void ModuleCallsiteContextGraph::updateAllocationCall(
CallInfo &Call, AllocationType AllocType) {
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
- removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5643,7 +5642,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
- removeAnyExistingAmbiguousAttribute(CBClone);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
<< ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index e4cb4574..07ad65c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5780,6 +5780,45 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
return nullptr;
}
+/// Match and fold patterns like:
+/// icmp eq/ne X, min(max(X, Lo), Hi)
+/// which represents a range check and can be represented as a ConstantRange.
+///
+/// For icmp eq, build ConstantRange [Lo, Hi + 1) and convert to:
+/// (X - Lo) u< (Hi + 1 - Lo)
+/// For icmp ne, build ConstantRange [Hi + 1, Lo) and convert to:
+/// (X - (Hi + 1)) u< (Lo - (Hi + 1))
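+///
+/// For example (illustrative constants): icmp eq X, smin(smax(X, 5), 10)
+/// becomes (X - 5) u< 6.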
+Instruction *InstCombinerImpl::foldICmpWithClamp(ICmpInst &I, Value *X,
+ MinMaxIntrinsic *Min) {
+ if (!I.isEquality() || !Min->hasOneUse() || !Min->isMin())
+ return nullptr;
+
+ const APInt *Lo = nullptr, *Hi = nullptr;
+ if (Min->isSigned()) {
+ if (!match(Min->getLHS(), m_OneUse(m_SMax(m_Specific(X), m_APInt(Lo)))) ||
+ !match(Min->getRHS(), m_APInt(Hi)) || !Lo->slt(*Hi))
+ return nullptr;
+ } else {
+ if (!match(Min->getLHS(), m_OneUse(m_UMax(m_Specific(X), m_APInt(Lo)))) ||
+ !match(Min->getRHS(), m_APInt(Hi)) || !Lo->ult(*Hi))
+ return nullptr;
+ }
+
+ ConstantRange CR = ConstantRange::getNonEmpty(*Lo, *Hi + 1);
+ ICmpInst::Predicate Pred;
+ APInt C, Offset;
+ if (I.getPredicate() == ICmpInst::ICMP_EQ)
+ CR.getEquivalentICmp(Pred, C, Offset);
+ else
+ CR.inverse().getEquivalentICmp(Pred, C, Offset);
+
+ if (!Offset.isZero())
+ X = Builder.CreateAdd(X, ConstantInt::get(X->getType(), Offset));
+
+ return replaceInstUsesWith(
+ I, Builder.CreateICmp(Pred, X, ConstantInt::get(X->getType(), C)));
+}
+
// Canonicalize checking for a power-of-2-or-zero value:
static Instruction *foldICmpPow2Test(ICmpInst &I,
InstCombiner::BuilderTy &Builder) {
@@ -7467,10 +7506,14 @@ Instruction *InstCombinerImpl::foldICmpCommutative(CmpPredicate Pred,
if (Instruction *NI = foldSelectICmp(Pred, SI, Op1, CxtI))
return NI;
- if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op0))
+ if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op0)) {
if (Instruction *Res = foldICmpWithMinMax(CxtI, MinMax, Op1, Pred))
return Res;
+ if (Instruction *Res = foldICmpWithClamp(CxtI, Op1, MinMax))
+ return Res;
+ }
+
{
Value *X;
const APInt *C;
@@ -8527,6 +8570,9 @@ static Instruction *foldFCmpFSubIntoFCmp(FCmpInst &I, Instruction *LHSI,
DenormalMode::getIEEE()) {
CI.replaceOperand(I, 0, X);
CI.replaceOperand(I, 1, Y);
+ I.setHasNoInfs(LHSI->hasNoInfs());
+ if (LHSI->hasNoNaNs())
+ I.setHasNoNaNs(true);
return &I;
}
break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 4f94aa2..e01c145 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -725,6 +725,7 @@ public:
Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
Instruction *foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax,
Value *Z, CmpPredicate Pred);
+ Instruction *foldICmpWithClamp(ICmpInst &Cmp, Value *X, MinMaxIntrinsic *Min);
Instruction *foldICmpEquality(ICmpInst &Cmp);
Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
Instruction *foldSignBitTest(ICmpInst &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index b6b3a95..87000a1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2934,32 +2934,6 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC,
return nullptr;
}
-static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy &Builder) {
- FreezeInst *FI = dyn_cast<FreezeInst>(Sel.getCondition());
- if (!FI)
- return nullptr;
-
- Value *Cond = FI->getOperand(0);
- Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
-
- // select (freeze(x == y)), x, y --> y
- // select (freeze(x != y)), x, y --> x
- // The freeze should be only used by this select. Otherwise, remaining uses of
- // the freeze can observe a contradictory value.
- // c = freeze(x == y) ; Let's assume that y = poison & x = 42; c is 0 or 1
- // a = select c, x, y ;
- // f(a, c) ; f(poison, 1) cannot happen, but if a is folded
- // ; to y, this can happen.
- CmpPredicate Pred;
- if (FI->hasOneUse() &&
- match(Cond, m_c_ICmp(Pred, m_Specific(TrueVal), m_Specific(FalseVal))) &&
- (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) {
- return Pred == ICmpInst::ICMP_EQ ? FalseVal : TrueVal;
- }
-
- return nullptr;
-}
-
/// Given that \p CondVal is known to be \p CondIsTrue, try to simplify \p SI.
static Value *simplifyNestedSelectsUsingImpliedCond(SelectInst &SI,
Value *CondVal,
@@ -4446,9 +4420,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
return replaceInstUsesWith(SI, PN);
- if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder))
- return replaceInstUsesWith(SI, Fr);
-
if (Value *V = foldRoundUpIntegerWithPow2Alignment(SI, Builder))
return replaceInstUsesWith(SI, V);
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 735bad1..e1dcaa85 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -883,84 +883,6 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
}
}
-struct WeightInfo {
- // Weights for current iteration.
- SmallVector<uint32_t> Weights;
- // Weights to subtract after each iteration.
- const SmallVector<uint32_t> SubWeights;
-};
-
-/// Update the branch weights of an exiting block of a peeled-off loop
-/// iteration.
-/// Let F is a weight of the edge to continue (fallthrough) into the loop.
-/// Let E is a weight of the edge to an exit.
-/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
-/// go to exit.
-/// Then, Estimated ExitCount = F / E.
-/// For I-th (counting from 0) peeled off iteration we set the weights for
-/// the peeled exit as (EC - I, 1). It gives us reasonable distribution,
-/// The probability to go to exit 1/(EC-I) increases. At the same time
-/// the estimated exit count in the remainder loop reduces by I.
-/// To avoid dealing with division rounding we can just multiple both part
-/// of weights to E and use weight as (F - I * E, E).
-static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
- setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
- for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
- if (SubWeight != 0)
- // Don't set the probability of taking the edge from latch to loop header
- // to less than 1:1 ratio (meaning Weight should not be lower than
- // SubWeight), as this could significantly reduce the loop's hotness,
- // which would be incorrect in the case of underestimating the trip count.
- Info.Weights[Idx] =
- Info.Weights[Idx] > SubWeight
- ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
- : SubWeight;
-}
-
-/// Initialize the weights for all exiting blocks.
-static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
- Loop *L) {
- SmallVector<BasicBlock *> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (BasicBlock *ExitingBlock : ExitingBlocks) {
- Instruction *Term = ExitingBlock->getTerminator();
- SmallVector<uint32_t> Weights;
- if (!extractBranchWeights(*Term, Weights))
- continue;
-
- // See the comment on updateBranchWeights() for an explanation of what we
- // do here.
- uint32_t FallThroughWeights = 0;
- uint32_t ExitWeights = 0;
- for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
- if (L->contains(Succ))
- FallThroughWeights += Weight;
- else
- ExitWeights += Weight;
- }
-
- // Don't try to update weights for degenerate case.
- if (FallThroughWeights == 0)
- continue;
-
- SmallVector<uint32_t> SubWeights;
- for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
- if (!L->contains(Succ)) {
- // Exit weights stay the same.
- SubWeights.push_back(0);
- continue;
- }
-
- // Subtract exit weights on each iteration, distributed across all
- // fallthrough edges.
- double W = (double)Weight / (double)FallThroughWeights;
- SubWeights.push_back((uint32_t)(ExitWeights * W));
- }
-
- WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});
- }
-}
-
/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
@@ -1332,11 +1254,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
Instruction *LatchTerm =
cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());
- // If we have branch weight information, we'll want to update it for the
- // newly created branches.
- DenseMap<Instruction *, WeightInfo> Weights;
- initBranchWeights(Weights, L);
-
// Identify what noalias metadata is inside the loop: if it is inside the
// loop, the associated metadata must be cloned for each iteration.
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
@@ -1382,11 +1299,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
assert(DT.verify(DominatorTree::VerificationLevel::Fast));
#endif
- for (auto &[Term, Info] : Weights) {
- auto *TermCopy = cast<Instruction>(VMap[Term]);
- updateBranchWeights(TermCopy, Info);
- }
-
// Remove Loop metadata from the latch branch instruction
// because it is not the Loop's latch branch anymore.
auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
@@ -1426,15 +1338,38 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
}
}
- for (const auto &[Term, Info] : Weights) {
- setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
- }
-
// Update Metadata for count of peeled off iterations.
unsigned AlreadyPeeled = 0;
if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
AlreadyPeeled = *Peeled;
- addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);
+ unsigned TotalPeeled = AlreadyPeeled + PeelCount;
+ addStringMetadataToLoop(L, PeeledCountMetaData, TotalPeeled);
+
+ // Update metadata for the estimated trip count. The original branch weight
+ // metadata is already correct for both the remaining loop and the peeled loop
+ // iterations, so do not adjust it.
+ //
+ // For example, consider what happens when peeling 2 iterations from a loop
+ // with an estimated trip count of 10 and inserting them before the remaining
+ // loop. Each of the peeled iterations and each iteration in the remaining
+ // loop still has the same probability of exiting the *entire original* loop
+ // as it did when in the original loop, and thus it should still have the same
+ // branch weights. The peeled iterations' non-zero probabilities of exiting
+ // already appropriately reduce the probability of reaching the remaining
+ // iterations just as they did in the original loop. Trying to also adjust
+ // the remaining loop's branch weights to reflect its new trip count of 8 will
+ // erroneously further reduce its block frequencies. However, in case an
+ // analysis later needs to determine the trip count of the remaining loop
+ // while examining it in isolation without considering the probability of
+ // actually reaching it, we store the new trip count as separate metadata.
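+  // For example (illustrative numbers), an original estimate of 10 with
+  // TotalPeeled = 2 is stored as 8; if TotalPeeled meets or exceeds the
+  // estimate, the stored value is clamped to 0.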
+ if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) {
+ unsigned EstimatedTripCountNew = *EstimatedTripCount;
+ if (EstimatedTripCountNew < TotalPeeled)
+ EstimatedTripCountNew = 0;
+ else
+ EstimatedTripCountNew -= TotalPeeled;
+ setLoopEstimatedTripCount(L, EstimatedTripCountNew);
+ }
if (Loop *ParentLoop = L->getParentLoop())
L = ParentLoop;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5d6c81..7750687 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -7954,6 +7957,13 @@ bool VPRecipeBuilder::getScaledReductions(
auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
&ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
for (const auto &[I, OpI] : enumerate(Ops)) {
+ auto *CI = dyn_cast<ConstantInt>(OpI);
+ if (I > 0 && CI &&
+ canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) {
+ ExtOpTypes[I] = ExtOpTypes[0];
+ ExtKinds[I] = ExtKinds[0];
+ continue;
+ }
Value *ExtOp;
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
return false;
@@ -8614,7 +8624,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10068,7 +10079,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind);
+ CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 02eb637..2555ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1753,6 +1753,16 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
}
#endif
+bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+ TTI::PartialReductionExtendKind ExtKind) {
+ APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits());
+ unsigned WideSize = CI->getType()->getScalarSizeInBits();
+ APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
+ ? TruncatedVal.sext(WideSize)
+ : TruncatedVal.zext(WideSize);
+ return ExtendedVal == CI->getValue();
+}
+
TargetTransformInfo::OperandValueInfo
VPCostContext::getOperandInfo(VPValue *V) const {
if (!V->isLiveIn())
@@ -1762,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+ bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;
@@ -1782,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ if (Op->isLiveIn() ||
+ (!AlwaysIncludeReplicatingR &&
+ isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+ (isa<VPReplicateRecipe>(Op) &&
+ cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774..1580a3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
+ ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind)
+ TargetTransformInfo::TargetCostKind CostKind,
+ ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind) {}
+ CostKind(CostKind), SE(SE) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Type *ResultTy,
- ArrayRef<const VPValue *> Operands,
- ElementCount VF);
+ /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+ /// is true, always compute the cost of scalarizing replicating operands.
+ InstructionCost
+ getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+ ElementCount VF,
+ bool AlwaysIncludeReplicatingR = false);
};
/// This class can be used to assign names to VPValues. For VPValues without
@@ -468,6 +472,10 @@ public:
};
#endif
+/// Check if a constant \p CI can be safely treated as having been extended
+/// from a narrower type with the given extension kind.
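+/// For example (illustrative values): i32 200 against an i8 narrow type holds
+/// for zero-extension (zext(trunc(200)) == 200) but not for sign-extension
+/// (sext(0xC8) == -56).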
+bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+ TTI::PartialReductionExtendKind ExtKind);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 46909a5..43d61f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -40,6 +40,7 @@
#include <cassert>
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
using VectorParts = SmallVector<Value *, 2>;
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPRecipeBase *OpR = Op->getDefiningRecipe();
// If the partial reduction is predicated, a select will be operand 0
- using namespace llvm::VPlanPatternMatch;
if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
OpR = Op->getDefiningRecipe();
}
@@ -340,6 +340,14 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
: Widen->getOperand(1));
ExtAType = GetExtendKind(ExtAR);
ExtBType = GetExtendKind(ExtBR);
+
+ if (!ExtBR && Widen->getOperand(1)->isLiveIn()) {
+ auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue());
+ if (canConstantBeExtended(CI, InputTypeA, ExtAType)) {
+ InputTypeB = InputTypeA;
+ ExtBType = ExtAType;
+ }
+ }
};
if (isa<VPWidenCastRecipe>(OpR)) {
@@ -1955,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
VPValue *Op0, *Op1;
- using namespace llvm::VPlanPatternMatch;
if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3103,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
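+/// For example (sketch): a replicated or widened GEP whose indices are
+/// loop-invariant or scalar IV steps / widened inductions qualifies, while a
+/// GEP indexed by another widened value does not.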
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+ auto *PtrR = Ptr->getDefiningRecipe();
+ if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+ cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<VPWidenGEPRecipe>(PtrR) ||
+ match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+ return false;
+
+ // We are looking for a GEP where all indices are either loop invariant or
+ // inductions.
+ for (VPValue *Opd : drop_begin(PtrR->operands())) {
+ if (!Opd->isDefinedOutsideLoopRegions() &&
+ !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+ SmallPtrSet<const VPUser *, 4> Seen;
+ SmallVector<const VPUser *> WorkList = {V};
+
+ while (!WorkList.empty()) {
+ auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ for (VPUser *U : Cur->users()) {
+ if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+ if (InterleaveR->getAddr() == Cur)
+ return true;
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+ if (RepR->getOpcode() == Instruction::Load &&
+ RepR->getOperand(0) == Cur)
+ return true;
+ if (RepR->getOpcode() == Instruction::Store &&
+ RepR->getOperand(1) == Cur)
+ return true;
+ }
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ if (MemR->getAddr() == Cur && MemR->isConsecutive())
+ return true;
+ }
+ }
+
+ append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ }
+ return false;
+}
+
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3210,21 +3273,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (isSingleScalar()) {
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
- return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
- }
+ if (VF.isScalable() && !isSingleScalar())
+ return InstructionCost::getInvalid();
+
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- break;
+ const VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion && ParentRegion->isReplicator())
+ break;
+
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ const VPValue *PtrOp = getOperand(!IsLoad);
+ // TODO: Handle cases where we need to pass a SCEV to
+ // getAddressComputationCost.
+ if (shouldUseAddressAccessSCEV(PtrOp))
+ break;
+
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+ Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+ InstructionCost ScalarCost =
+ ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCost;
+
+ SmallVector<const VPValue *> OpsToScalarize;
+ Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+ // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
+ // don't assign scalarization overhead when the target does not prefer
+ // vectorized addressing and the loaded value is used as part of an address
+ // of another load or store.
+ bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+ if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+ bool EfficientVectorLoadStore =
+ Ctx.TTI.supportsEfficientVectorElementLoadStore();
+ if (!(IsLoad && !PreferVectorizedAddressing) &&
+ !(!IsLoad && EfficientVectorLoadStore))
+ append_range(OpsToScalarize, operands());
+
+ if (!EfficientVectorLoadStore)
+ ResultTy = Ctx.Types.inferScalarType(this);
+ }
+
+ return (ScalarCost * VF.getFixedValue()) +
+ Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}
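To summarize the new replicating load/store costing above: a single-scalar access is charged one scalar memory op plus one address computation, while a replicated access is charged VF such scalar copies plus a scalarization overhead whose inputs depend on two TTI hooks. The following is a standalone sketch of that selection logic under hypothetical names (TargetHooksSketch, classifyOverhead and replicatedMemOpCost are not VPlan or TTI API; this is a simplified model, not the implementation):

#include <cstdint>

// Stand-ins for TTI::prefersVectorizedAddressing() and
// TTI::supportsEfficientVectorElementLoadStore().
struct TargetHooksSketch {
  bool PrefersVectorizedAddressing;
  bool SupportsEfficientVectorElementLoadStore;
};

// What gets passed to Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, ...).
struct ScalarizationInputsSketch {
  bool IncludeOperands = false; // OpsToScalarize = operands()
  bool IncludeResult = false;   // ResultTy = scalar type of the recipe
};

// Mirrors the branching above: no overhead at all when the target keeps
// addresses scalar and the loaded value is used by another load/store address;
// otherwise the operands and/or the result are charged.
static ScalarizationInputsSketch
classifyOverhead(bool IsLoad, bool UsedByLoadStoreAddress,
                 const TargetHooksSketch &TTI) {
  ScalarizationInputsSketch In;
  if (!TTI.PrefersVectorizedAddressing && UsedByLoadStoreAddress)
    return In;
  if (!(IsLoad && !TTI.PrefersVectorizedAddressing) &&
      !(!IsLoad && TTI.SupportsEfficientVectorElementLoadStore))
    In.IncludeOperands = true;
  if (!TTI.SupportsEfficientVectorElementLoadStore)
    In.IncludeResult = true;
  return In;
}

// Overall cost shape: VF copies of (memory op + address computation), plus the
// selected scalarization overhead.
static uint64_t replicatedMemOpCost(uint64_t ScalarMemOpCost, uint64_t AddrCost,
                                    uint64_t VF, uint64_t Overhead) {
  return (ScalarMemOpCost + AddrCost) * VF + Overhead;
}

For a target that prefers vectorized addressing and lacks efficient vector element load/store, a replicated load is thus charged VF scalar loads plus insert/extract overhead for both its result and its operands; the call above passes true for the new AlwaysIncludeReplicatingR parameter so the replicating operands are costed as well.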
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acdb379..f76777b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1110,8 +1110,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// x && !x -> 0
if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
- return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
- ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def))));
+ return Def->replaceAllUsesWith(Plan->getFalse());
if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
return Def->replaceAllUsesWith(X);
@@ -3346,12 +3345,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
}
- [[maybe_unused]] auto *ConstStep =
- ScalarStep->isLiveIn()
- ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
- : nullptr;
- assert(!ConstStep || ConstStep->getValue() != 1);
- (void)ConstStep;
+ assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
ScalarStep =
Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
index a08f859..6d9aa8d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
@@ -756,3 +756,129 @@ e.1:
e.2:
ret void
}
+
+define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context(ptr %A, ptr %B) nosync {
+; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context'
+; CHECK-NEXT: loop.header:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ]
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ]
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+ %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+ %l = load i32, ptr %gep.A, align 4
+ store i32 0, ptr %gep.B, align 4
+ %cntable.c.1 = icmp ult i64 %iv, 1000
+ %iv.next = add nuw nsw i64 %iv, 1
+ br i1 %cntable.c.1, label %b2, label %e.1
+
+b2:
+ %uncntable.c.0 = icmp eq i32 %l, 0
+ br i1 %uncntable.c.0, label %e.2, label %b3
+
+b3:
+ %cntable.c.2 = icmp eq i64 %iv.next, 500
+ br i1 %cntable.c.2, label %cleanup4, label %latch
+
+latch:
+ br label %loop.header
+
+cleanup4:
+ ret void
+
+e.1:
+ ret void
+
+e.2:
+ ret void
+}
+
+define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors(ptr %A, ptr %B, i1 %c) nosync {
+; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors'
+; CHECK-NEXT: loop.header:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ]
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ]
+ br i1 %c, label %then, label %else
+
+then:
+ br label %loop.header
+
+else:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %latch ]
+ %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+ %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+ %l = load i32, ptr %gep.A, align 4
+ store i32 0, ptr %gep.B, align 4
+ %cntable.c.1 = icmp ult i64 %iv, 1000
+ %iv.next = add nuw nsw i64 %iv, 1
+ br i1 %cntable.c.1, label %b2, label %e.1
+
+b2:
+ %uncntable.c.0 = icmp eq i32 %l, 0
+ br i1 %uncntable.c.0, label %e.2, label %b3
+
+b3:
+ %cntable.c.2 = icmp eq i64 %iv.next, 500
+ br i1 %cntable.c.2, label %cleanup4, label %latch
+
+latch:
+ br label %loop.header
+
+cleanup4:
+ ret void
+
+e.1:
+ ret void
+
+e.2:
+ ret void
+}
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 32c7c64..e810fcb6 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -247,7 +247,7 @@ if (LLVM_INCLUDE_SPIRV_TOOLS_TESTS)
list(APPEND LLVM_TEST_DEPENDS spirv-link)
endif()
-add_custom_target(llvm-test-depends DEPENDS ${LLVM_TEST_DEPENDS})
+add_custom_target(llvm-test-depends DEPENDS ${LLVM_TEST_DEPENDS} UnitTests)
set_target_properties(llvm-test-depends PROPERTIES FOLDER "LLVM/Tests")
if(LLVM_BUILD_TOOLS)
@@ -259,7 +259,7 @@ endif()
add_lit_testsuite(check-llvm "Running the LLVM regression tests"
${CMAKE_CURRENT_BINARY_DIR}
${exclude_from_check_all}
- DEPENDS ${LLVM_TEST_DEPENDS}
+ DEPENDS ${LLVM_TEST_DEPENDS} UnitTests
)
set_target_properties(check-llvm PROPERTIES FOLDER "LLVM/Tests")
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
index aca2816..7fd0cee 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
@@ -164,10 +164,10 @@ stack:
- { id: 1, name: z1.addr, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!31', debug-info-expression: '!DIExpression()',
debug-info-location: '!32' }
- - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!33', debug-info-expression: '!DIExpression()',
debug-info-location: '!34' }
- - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!35', debug-info-expression: '!DIExpression()',
debug-info-location: '!36' }
- { id: 4, name: w0.addr, size: 4, alignment: 4, local-offset: -4, debug-info-variable: '!37',
@@ -181,10 +181,10 @@ stack:
- { id: 7, name: localv1, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!45', debug-info-expression: '!DIExpression()',
debug-info-location: '!46' }
- - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!48', debug-info-expression: '!DIExpression()',
debug-info-location: '!49' }
- - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!51', debug-info-expression: '!DIExpression()',
debug-info-location: '!52' }
machineFunctionInfo: {}
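The stack-id changes in this test (and in the MIR and llc tests that follow) assume a dedicated scalable-predicate-vector stack ID alongside the existing scalable-vector one; the hunks introducing that ID are not shown here. A hedged sketch of what such plumbing typically looks like, with illustrative names and values only (SketchTargetStackID is not the real enum, and the enumCase lines merely indicate the MIR spelling the tests expect):

// Illustrative only: a separate stack ID for scalable predicate spill slots
// and locals, next to the existing scalable-vector ID.
namespace SketchTargetStackID {
enum Value : unsigned {
  Default = 0,
  ScalableVector,
  ScalablePredicateVector, // PPR spill slots / predicate locals
};
} // namespace SketchTargetStackID

// MIR YAML serialization would map the spelling used by the tests, roughly:
//   IO.enumCase(ID, "scalable-vector", ScalableVector);
//   IO.enumCase(ID, "scalable-predicate-vector", ScalablePredicateVector);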
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
index 0ea180b..41ba554 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
@@ -96,8 +96,8 @@ stack:
- { id: 1, size: 8, alignment: 8 }
- { id: 2, size: 16, alignment: 16, stack-id: scalable-vector }
- { id: 3, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 4, size: 2, alignment: 2, stack-id: scalable-vector }
- - { id: 5, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 4, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
+ - { id: 5, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
machineFunctionInfo: {}
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
new file mode 100644
index 0000000..35eafe8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -0,0 +1,587 @@
+# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t
+# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO
+# RUN: rm -rf %t
+#
+# Test allocation and deallocation of SVE objects on the stack with
+# split-sve-objects (and hazard padding) enabled. This also tests using a
+# combination of scalable and non-scalable offsets to access the SVE on the
+# stack.
+#
+# With split-sve-objects (which implies hazard padding) the SVE area is split
+# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR
+# area holds all scalable predicate callee saves and locals, and the ZPR area
+# holds all scalable vector callee saves and locals. Additionally, any FPR
+# callee save is promoted to a ZPR callee save (to avoid needing additional
+# hazard padding in the callee save area).
+#
+# +-------------+
+# | stack arg |
+# +-------------+ <- SP before call
+# | Callee Saves|
+# | Frame record| (if available)
+# |-------------| <- FP (if available)
+# | PPR area |
+# |-------------|
+# |/////////////| hazard padding
+# |-------------|
+# | ZPR area |
+# +-------------+
+# | : |
+# | Stack objs |
+# | : |
+# +-------------+ <- SP after call and frame-setup
+#
+--- |
+
+ define void @test_allocate_split_sve() uwtable { entry: unreachable }
+ define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable }
+ define void @test_address_split_sve() uwtable { entry: unreachable }
+ define void @test_address_split_sve_fp() uwtable { entry: unreachable }
+ define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable }
+
+...
+---
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# |----------|
+# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.2 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_allocate_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $fp
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_allocate_split_sve
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+...
+---
+
+# Stack realignment is not supported with split-sve-objects, so we fall back to
+# the default hazard padding implementation. This does not prevent hazards
+# between ZPRs and PPRs (TODO: support this case).
+#
+# +----------+
+# | lr, fp | // frame record
+# |----------|
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.2 | // not scalable
+# +----------+ <- SP
+
+name: test_allocate_split_sve_realigned
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 32 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+
+# CHECK-LABEL: name: test_allocate_split_sve_realigned
+# CHECK: stackSize: 2080
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $lr
+# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
+#
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
+# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
+# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve_realigned
+# ASM: sub sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: str x29, [sp, #1024]
+# ASM-NEXT: str x30, [sp, #1032]
+# ASM-NEXT: add x29, sp, #1024
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+#
+# ASM: sub sp, x29, #1024
+# ASM-NEXT: .cfi_def_cfa wsp, 1040
+# ASM-NEXT: ldr x30, [sp, #1032]
+# ASM-NEXT: ldr x29, [sp, #1024]
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+...
+---
+
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# +----------+
+# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes
+# | %stack.1 | // scalable vector @ SP + 1040b
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve
+frameInfo:
+ maxAlignment: 16
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# +----------+
+# | lr, fp | // frame record
+# +----------+ <- FP
+# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes
+# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve_fp
+# CHECK: stackSize: 1056
+#
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3
+# CHECK-NEXT: STR_PXI $p0, $fp, -1
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve_fp
+# ASM: stp x29, x30, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: mov x29, sp
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: addvl sp, sp, #-2
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldp x29, x30, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve_fp
+frameInfo:
+ maxAlignment: 16
+ isFrameAddressTaken: true
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: save_restore_ppr_zpr
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7)
+# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6)
+# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5)
+#
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+#
+# CHECK: $sp = frame-destroy ADDXri $sp, 1056, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
+# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
+# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7)
+# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6)
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: save_restore_ppr_zpr:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: str p6, [sp, #5, mul vl]
+# ASM-NEXT: str p5, [sp, #6, mul vl]
+# ASM-NEXT: str p4, [sp, #7, mul vl]
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-3
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: str z10, [sp]
+# ASM-NEXT: str z9, [sp, #1, mul vl]
+# ASM-NEXT: str z8, [sp, #2, mul vl]
+# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1040
+# ASM-NEXT: sub sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 2096 + 32 * VG
+#
+# ASM: add sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: ldr z10, [sp]
+# ASM-NEXT: ldr z9, [sp, #1, mul vl]
+# ASM-NEXT: ldr z8, [sp, #2, mul vl]
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 16 + 32 * VG
+# ASM-NEXT: addvl sp, sp, #3
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: .cfi_restore z8
+# ASM-NEXT: .cfi_restore z9
+# ASM-NEXT: .cfi_restore z10
+# ASM-NEXT: ldr p6, [sp, #5, mul vl]
+# ASM-NEXT: ldr p5, [sp, #6, mul vl]
+# ASM-NEXT: ldr p4, [sp, #7, mul vl]
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2096, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: save_restore_ppr_zpr
+stack:
+ - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+ bb.0.entry:
+
+ $p4 = IMPLICIT_DEF
+ $p5 = IMPLICIT_DEF
+ $p6 = IMPLICIT_DEF
+ $z8 = IMPLICIT_DEF
+ $z9 = IMPLICIT_DEF
+ $z10 = IMPLICIT_DEF
+
+ RET_ReallyLR
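The immediates in the CHECK and ASM lines above are expressed in register-sized "mul vl" units, while the CFI comments express the same distances in multiples of VG (the number of 64-bit granules in a vector register). Assuming the usual SVE scaling, where a Z register is 16 * vscale bytes, a P register is 2 * vscale bytes and VG equals 2 * vscale, the conversions used throughout these tests work out as in this small worked example (plain C++, values chosen only for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t vscale = 2;           // e.g. a 256-bit SVE implementation
  const uint64_t ZBytes = 16 * vscale; // one ZPR, one "addvl #1" step
  const uint64_t PBytes = 2 * vscale;  // one PPR, one "addpl #1" step
  const uint64_t VG = 2 * vscale;      // 64-bit granules per vector

  // "addvl sp, sp, #-1" moves SP by one Z register, i.e. 8 * VG bytes; this is
  // why each addvl #-1 adds "8 * VG" to the CFA expressions above.
  std::printf("addvl #1         = %llu bytes (8 * VG = %llu)\n",
              (unsigned long long)ZBytes, (unsigned long long)(8 * VG));

  // Predicate str/ldr immediates scale by the P register size, e.g.
  // "str p6, [sp, #5, mul vl]" addresses sp + 5 * 2 * vscale bytes.
  std::printf("#5, mul vl (PPR) = %llu bytes (= 10 * vscale)\n",
              (unsigned long long)(5 * PBytes));

  // Vector str/ldr immediates scale by the Z register size, e.g.
  // "str z9, [sp, #1, mul vl]" addresses sp + 16 * vscale bytes.
  std::printf("#1, mul vl (ZPR) = %llu bytes (= 16 * vscale)\n",
              (unsigned long long)(1 * ZBytes));
  return 0;
}

The same scaling explains the offsets quoted in the split-sve-stack-frame-layout.ll comments further down, e.g. "#15, mul vl for str/ldr PPR" corresponding to 30 * vscale bytes and "#-2, mul vl for str/ldr ZPR" corresponding to -32 * vscale bytes.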
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 03a6aab..1101416 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -1215,19 +1215,19 @@ body: |
# CHECK: - { id: 2, name: '', type: default, offset: -112, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 3, name: '', type: default, offset: -114, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 4, name: '', type: spill-slot, offset: -144, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 5, name: '', type: spill-slot, offset: -146, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 6, name: '', type: spill-slot, offset: -16, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z8',
# CHECK: - { id: 7, name: '', type: spill-slot, offset: -32, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z23',
# CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p4',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p4',
# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p15',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p15',
# CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16,
# CHECK-NEXT: stack-id: default, callee-saved-register: '$fp',
#
@@ -1295,9 +1295,9 @@ stack:
- { id: 0, type: default, size: 32, alignment: 16, stack-id: scalable-vector }
- { id: 1, type: default, size: 4, alignment: 2, stack-id: scalable-vector }
- { id: 2, type: default, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
- { id: 4, type: spill-slot, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index bff0cac..0298168 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -983,26 +983,22 @@ body: |
; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
; EXPAND-NEXT: {{ $}}
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
- ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3)
+ ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2)
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1)
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1)
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
;
; EXPAND-NEXT: $p8 = IMPLICIT_DEF
;
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1)
; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
- ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+ ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
; If we spill a register above p8, p4 must also be saved, so we can guarantee
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 2b16dd0f..5569175 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -39,7 +39,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr
; EXPAND: STR_PXI $p0, $sp, 7
@@ -82,7 +82,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -127,7 +127,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -172,7 +172,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_pnr
; EXPAND: STR_PXI $pn0, $sp, 7
@@ -211,7 +211,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr
; EXPAND: renamable $pn8 = WHILEGE_CXX_B
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
new file mode 100644
index 0000000..690a39d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -0,0 +1,824 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2048+16*vscale
+; <hazard padding> sp+1024+16*vscale
+; %zpr_local sp+1024
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2048
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1024
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding fp-16*vscale
+; <hazard padding> fp-1024-16*vscale
+; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR)
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: zpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2064
+; <hazard padding> sp+1040
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: fpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: str p0, [x8, #7, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: fpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding> sp+1040+16*vscale
+; <fpr callee save: z8> sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: gpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2080 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1>
+ %gpr_local = alloca i64
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile i64 %int, ptr %gpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; <fpr callee save: z8>
+; <hazard padding>
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: gpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1>
+ %gpr_local = alloca i64
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile i64 %int, ptr %gpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; <CS PPRs>
+; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7)
+; 14 x vscale bytes of padding sp+2080+272*vscale
+; <hazard padding> sp+1056+272*vscale
+; <CS ZPRs> sp+1056+16*vscale
+; %zpr_local sp+1056
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas(<vscale x 16 x i1> %pred, double %fp) {
+; CHECK-LABEL: all_stack_areas:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xa0, 0x01, 0x1e, 0x22 // sp + 2096 + 160 * VG
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1040
+; CHECK-NEXT: add x0, sp, #2080
+; CHECK-NEXT: add x8, sp, #2080
+; CHECK-NEXT: add x1, sp, #1056
+; CHECK-NEXT: addvl x0, x0, #17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: addpl x0, x0, #7
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x8, #143, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %fpr_local = alloca double
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local)
+ ret void
+}
+declare void @foo(ptr, ptr, ptr, ptr)
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; <CS PPRs> fp-32*vscale
+; %ppr_local fp-34*vscale (addpl #-17)
+; 14 x vscale bytes of padding fp-48*vscale
+; <hazard padding> fp-1024-48*vscale
+; <CS ZPRs> fp-1024-304*vscale
+; %zpr_local fp-1024-320*vscale (addvl #-20)
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas_fp(<vscale x 16 x i1> %pred, double %fp) "frame-pointer"="all" {
+; CHECK-LABEL: all_stack_areas_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1056
+; CHECK-NEXT: sub x1, x29, #1024
+; CHECK-NEXT: addpl x0, x29, #-17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: addvl x1, x1, #-20
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x29, #-17, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %fpr_local = alloca double
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local)
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2112-288 x vscale], Type: Variable, Align: 16, Size: 1024
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: svecc_call:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w26, -16
+; CHECK-NEXT: .cfi_offset w27, -24
+; CHECK-NEXT: .cfi_offset w28, -32
+; CHECK-NEXT: .cfi_offset vg, -48
+; CHECK-NEXT: .cfi_offset w30, -56
+; CHECK-NEXT: .cfi_offset w29, -64
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: tbz w19, #0, .LBB8_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB8_2: // %entry
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov w1, #45 // =0x2d
+; CHECK-NEXT: mov w2, #37 // =0x25
+; CHECK-NEXT: bl memset
+; CHECK-NEXT: tbz w19, #0, .LBB8_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB8_4: // %entry
+; CHECK-NEXT: mov w0, #22647 // =0x5877
+; CHECK-NEXT: movk w0, #59491, lsl #16
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: .cfi_def_cfa wsp, 64
+; CHECK-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w26
+; CHECK-NEXT: .cfi_restore w27
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+ %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
+ ret i32 -396142473
+}
+declare ptr @memset(ptr, i32, i32)
+
+; FIXME: aarch64-split-sve-objects is currently not supported in this function
+; as it requires stack realignment (for the 32-byte aligned alloca).
+; GPR CSRs
+; <hazard padding>
+; FPR CSRs
+; <hazard padding>
+; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here!
+; <realignment padding>
+; -> sp
+define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local_realignment:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: sub x9, sp, #1040
+; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: addvl x9, x9, #-2
+; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x8, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: sub sp, x29, #1024
+; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, align 32
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
+
+define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr)
+; CHECK-LABEL: zpr_and_ppr_local_stack_probing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1824
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2848
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1824
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1824
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible"
+{
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, i64 100, align 8
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 5f52280..333a8be 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE
define i32 @basic(i32 noundef %num) {
; CHECK-LABEL: basic:
@@ -1503,72 +1504,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>
}
define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
-; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK0-NEXT: addvl sp, sp, #-1
-; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p5.b, p0.b
-; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p4.b, p1.b
-; CHECK0-NEXT: mov p0.b, p2.b
-; CHECK0-NEXT: mov p1.b, p3.b
-; CHECK0-NEXT: mov p2.b, p5.b
-; CHECK0-NEXT: mov p3.b, p4.b
-; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: addvl sp, sp, #1
-; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK64: // %bb.0:
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-1
-; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: sub sp, sp, #64
-; CHECK64-NEXT: mov p4.b, p1.b
-; CHECK64-NEXT: mov p5.b, p0.b
-; CHECK64-NEXT: mov p0.b, p2.b
-; CHECK64-NEXT: mov p1.b, p3.b
-; CHECK64-NEXT: mov p2.b, p5.b
-; CHECK64-NEXT: mov p3.b, p4.b
-; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK64-NEXT: add sp, sp, #64
-; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
-; CHECK64-NEXT: ret
-;
-; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK1024: // %bb.0:
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov p4.b, p1.b
-; CHECK1024-NEXT: mov p5.b, p0.b
-; CHECK1024-NEXT: mov p0.b, p2.b
-; CHECK1024-NEXT: mov p1.b, p3.b
-; CHECK1024-NEXT: mov p2.b, p5.b
-; CHECK1024-NEXT: mov p3.b, p4.b
-; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p5.b, p0.b
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p4.b, p1.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p2.b, p5.b
+; CHECK-NEXT: mov p3.b, p4.b
+; CHECK-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
ret [2 x <vscale x 4 x i1>] %res
}
@@ -2113,139 +2066,269 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov x8, x0
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB28_2: // %entry
-; CHECK1024-NEXT: mov x0, x8
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB28_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x8, x0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
%call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
@@ -2505,138 +2588,267 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_alloca_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1072
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB29_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB29_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1072
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 7bddd1d..cc63c7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -56,9 +56,9 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
; CHECK: name: caller_with_many_svepred_arg
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
@@ -90,7 +90,7 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i
; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
@@ -139,7 +139,7 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <v
; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
@@ -200,7 +200,7 @@ define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <v
; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
new file mode 100644
index 0000000..584753b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -0,0 +1,2854 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ store <vscale x 1 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ store <vscale x 2 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ store <vscale x 3 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ store <vscale x 4 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ store <vscale x 5 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ store <vscale x 6 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ store <vscale x 7 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ store <vscale x 8 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ store <vscale x 9 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ store <vscale x 10 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ store <vscale x 11 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ store <vscale x 12 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ store <vscale x 13 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p2, [x1]
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ store <vscale x 14 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ store <vscale x 15 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ store <vscale x 16 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv17i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #17 // =0x11
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 17 x i8>, ptr %a
+ store <vscale x 17 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv18i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 18 x i8>, ptr %a
+ store <vscale x 18 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv19i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #19 // =0x13
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 19 x i8>, ptr %a
+ store <vscale x 19 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv20i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 20 x i8>, ptr %a
+ store <vscale x 20 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv21i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #21 // =0x15
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 21 x i8>, ptr %a
+ store <vscale x 21 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv22i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8, all, mul #5
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 22 x i8>, ptr %a
+ store <vscale x 22 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv23i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #23 // =0x17
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 23 x i8>, ptr %a
+ store <vscale x 23 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv24i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 24 x i8>, ptr %a
+ store <vscale x 24 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv25i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #25 // =0x19
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 25 x i8>, ptr %a
+ store <vscale x 25 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv26i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cnth x8, all, mul #3
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 26 x i8>, ptr %a
+ store <vscale x 26 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv27i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #27 // =0x1b
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 27 x i8>, ptr %a
+ store <vscale x 27 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv28i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 28 x i8>, ptr %a
+ store <vscale x 28 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv29i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #29 // =0x1d
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 29 x i8>, ptr %a
+ store <vscale x 29 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv30i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntw x8, all, mul #7
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl]
+; CHECK-NEXT: str z3, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 30 x i8>, ptr %a
+ store <vscale x 30 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv31i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #31 // =0x1f
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 31 x i8>, ptr %a
+ store <vscale x 31 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 32 x i8>, ptr %a
+ store <vscale x 32 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ store <vscale x 1 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ store <vscale x 2 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ store <vscale x 3 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ store <vscale x 4 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ store <vscale x 5 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ store <vscale x 6 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ store <vscale x 7 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ store <vscale x 8 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i16>, ptr %a
+ store <vscale x 9 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i16>, ptr %a
+ store <vscale x 10 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i16>, ptr %a
+ store <vscale x 11 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i16>, ptr %a
+ store <vscale x 12 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i16>, ptr %a
+ store <vscale x 13 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i16>, ptr %a
+ store <vscale x 14 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i16>, ptr %a
+ store <vscale x 15 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i16>, ptr %a
+ store <vscale x 16 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ store <vscale x 1 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ store <vscale x 2 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ store <vscale x 3 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ store <vscale x 4 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i32>, ptr %a
+ store <vscale x 5 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i32>, ptr %a
+ store <vscale x 6 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i32>, ptr %a
+ store <vscale x 7 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i32>, ptr %a
+ store <vscale x 8 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i64>, ptr %a
+ store <vscale x 1 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i64>, ptr %a
+ store <vscale x 2 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i64>, ptr %a
+ store <vscale x 3 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i64>, ptr %a
+ store <vscale x 4 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x half>, ptr %a
+ store <vscale x 1 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x half>, ptr %a
+ store <vscale x 2 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x half>, ptr %a
+ store <vscale x 3 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x half>, ptr %a
+ store <vscale x 4 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x half>, ptr %a
+ store <vscale x 5 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x half>, ptr %a
+ store <vscale x 6 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x half>, ptr %a
+ store <vscale x 7 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x half>, ptr %a
+ store <vscale x 8 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x half>, ptr %a
+ store <vscale x 9 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x half>, ptr %a
+ store <vscale x 10 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x half>, ptr %a
+ store <vscale x 11 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x half>, ptr %a
+ store <vscale x 12 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x half>, ptr %a
+ store <vscale x 13 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x half>, ptr %a
+ store <vscale x 14 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x half>, ptr %a
+ store <vscale x 15 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x half>, ptr %a
+ store <vscale x 16 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x float>, ptr %a
+ store <vscale x 1 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x float>, ptr %a
+ store <vscale x 2 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x float>, ptr %a
+ store <vscale x 3 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x float>, ptr %a
+ store <vscale x 4 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x float>, ptr %a
+ store <vscale x 5 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1w { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x float>, ptr %a
+ store <vscale x 6 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x float>, ptr %a
+ store <vscale x 7 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x float>, ptr %a
+ store <vscale x 8 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x double>, ptr %a
+ store <vscale x 1 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x double>, ptr %a
+ store <vscale x 2 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x double>, ptr %a
+ store <vscale x 3 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x double>, ptr %a
+ store <vscale x 4 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x bfloat>, ptr %a
+ store <vscale x 1 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x bfloat>, ptr %a
+ store <vscale x 2 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x bfloat>, ptr %a
+ store <vscale x 3 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x bfloat>, ptr %a
+ store <vscale x 4 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x bfloat>, ptr %a
+ store <vscale x 5 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x bfloat>, ptr %a
+ store <vscale x 6 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x bfloat>, ptr %a
+ store <vscale x 7 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x bfloat>, ptr %a
+ store <vscale x 8 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x bfloat>, ptr %a
+ store <vscale x 9 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x bfloat>, ptr %a
+ store <vscale x 10 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x bfloat>, ptr %a
+ store <vscale x 11 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x bfloat>, ptr %a
+ store <vscale x 12 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x bfloat>, ptr %a
+ store <vscale x 13 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x bfloat>, ptr %a
+ store <vscale x 14 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x bfloat>, ptr %a
+ store <vscale x 15 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x bfloat>, ptr %a
+ store <vscale x 16 x bfloat> %c, ptr %b
+ ret void
+}
+
+define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.sext
+}
+
+define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.sext
+}
+
+define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.sext
+}
+
+define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.sext
+}
+
+define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.sext
+}
+
+define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.sext
+}
+
+define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.sext
+}
+
+define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.sext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.sext
+}
+
+define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.sext
+}
+
+define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.sext
+}
+
+define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.sext
+}
+
+define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.sext
+}
+
+define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.sext
+}
+
+define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.sext
+}
+
+define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.sext
+}
+
+define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.sext
+}
+
+define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.sext
+}
+
+define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.sext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.sext
+}
+
+define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.sext
+}
+
+define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.sext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.sext
+}
+
+define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.sext
+}
+
+define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.sext
+}
+
+define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.sext
+}
+
+define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.sext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.sext
+}
+
+define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.sext
+}
+
+define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.sext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.sext
+}
+
+define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.sext
+}
+
+define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.sext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.sext
+}
+
+define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.zext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.zext
+}
+
+define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.zext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.zext
+}
+
+define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.zext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.zext
+}
+
+define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.zext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.zext
+}
+
+define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.zext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.zext
+}
+
+define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.zext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.zext
+}
+
+define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.zext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.zext
+}
+
+define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.zext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.zext
+}
+
+define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.zext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.zext
+}
+
+define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.zext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.zext
+}
+
+define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.zext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.zext
+}
+
+define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.zext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.zext
+}
+
+define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.zext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.zext
+}
+
+define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.zext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.zext
+}
+
+define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.zext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.zext
+}
+
+define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.zext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.zext
+}
+
+define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.zext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.zext
+}
+
+define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.zext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.zext
+}
+
+define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.zext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.zext
+}
+
+define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.zext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.zext
+}
+
+define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.zext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.zext
+}
+
+define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.zext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.zext
+}
+
+define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.zext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.zext
+}
+
+define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.zext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.zext
+}
+
+define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.zext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.zext
+}
+
+define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.zext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.zext
+}
+
+define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.zext
+}
+
+define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.zext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.zext
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 2cbb29e..d8de12c 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -672,5 +672,3 @@ entry:
ret i32 %x
}
declare void @other()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-FRAMELAYOUT: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 029aa39..ce1ea4d 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -128,13 +128,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0
%2.sub2_sub3:areg_128 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -153,13 +153,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0
%2.sub2_sub3:areg_128_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -398,14 +398,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0
%1.sub1:areg_128 = COPY %0
%1.sub2:areg_128 = COPY %0
%1.sub3:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -425,14 +425,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0
%1.sub1:areg_128_align2 = COPY %0
%1.sub2:areg_128_align2 = COPY %0
%1.sub3:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -641,13 +641,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -668,13 +668,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
%0.sub1:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
%2.sub2_sub3:areg_128_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -890,14 +890,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0.sub0
%1.sub1:areg_128 = COPY %0.sub0
%1.sub2:areg_128 = COPY %0.sub0
%1.sub3:areg_128 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -917,14 +917,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0.sub0
%1.sub1:areg_128_align2 = COPY %0.sub0
%1.sub2:areg_128_align2 = COPY %0.sub0
%1.sub3:areg_128_align2 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -1051,13 +1051,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1076,13 +1076,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -1358,11 +1358,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1379,11 +1379,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 92836d8..63db24a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -486,7 +486,7 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
@@ -516,7 +516,7 @@ body: |
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
S_ENDPGM 0
...
@@ -1368,7 +1368,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1408,7 +1408,7 @@ body: |
undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -1726,7 +1726,7 @@ body: |
; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -1763,7 +1763,7 @@ body: |
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
%4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 9cbdc38..5b3e486 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8847370 /* regdef:AReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9568266 /* regdef:AReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index ca77482..fa52b96 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -1,19 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn -run-pass register-coalescer -o - %s | FileCheck %s
-# Check that coalescer does not create wider register tuple than in source
-
-# CHECK: - { id: 2, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 3, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 4, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 5, class: vreg_96, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 6, class: vreg_96, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 7, class: vreg_128, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 8, class: vreg_128, preferred-register: '', flags: [ ] }
+# Check that the coalescer does not create a wider register tuple than in
+# the source.
# No more registers shall be defined
-# CHECK-NEXT: liveins:
-# CHECK: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %4,
-# CHECK: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, %6,
-
---
name: main
alignment: 1
@@ -52,6 +42,23 @@ body: |
bb.0.entry:
liveins: $sgpr0, $vgpr0_vgpr1
+ ; CHECK-LABEL: name: main
+ ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr
%3 = IMPLICIT_DEF
undef %4.sub0 = COPY $sgpr0
%4.sub1 = COPY %3.sub0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index de7d234..b9bf76c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
@@ -172,3 +172,91 @@ entry:
%qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
ret i64 %qm
}
+
+;; Ensure that the AND/ICMP pair cannot be fused into a single SCC-setting AND, because s_quadmask_b32 implicitly defines SCC.
+define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrspace(1) %ptr) {
+; GFX11-GISEL-LABEL: test_scc_quadmask_32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_quadmask_b32 s1, s1
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-GISEL-NEXT: global_store_b32 v2, v3, s[2:3]
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_scc_quadmask_32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 1
+; GFX11-SDAG-NEXT: s_quadmask_b32 s1, s1
+; GFX11-SDAG-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v2, v3, s[2:3]
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX11-SDAG-NEXT: s_endpgm
+ %and = and i32 %val0, 1
+ %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val1) nounwind readnone
+ store i32 %result, ptr addrspace(1) %ptr
+ %cmp = icmp eq i32 %and, 0
+ %sel = select i1 %cmp, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) null, align 4
+ ret void
+}
+
+;; Ensure that the AND/ICMP pair cannot be fused into a single SCC-setting AND, because s_quadmask_b64 implicitly defines SCC.
+define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrspace(1) %ptr) {
+; GFX11-GISEL-LABEL: test_scc_quadmask_64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-GISEL-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX11-GISEL-NEXT: global_store_b32 v[2:3], v5, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_scc_quadmask_64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1
+; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v5, off
+; GFX11-SDAG-NEXT: s_endpgm
+ %and = and i32 %val0, 1
+ %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val1) nounwind readnone
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp eq i32 %and, 0
+ %sel = select i1 %cmp, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) null, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
index 0de7f8f..bd29e9e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
; Regression test for issue 160181
; One variable is chosen to be assigned at zero. Here, that's @both
@@ -22,12 +22,20 @@
;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]]
+
;.
define void @func_one() {
; CHECK-LABEL: define {{[^@]+}}@func_one() {
-; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]]
-; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]]
-; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]]
+; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
+; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4
+; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]]
; CHECK-NEXT: ret void
;
%val0 = load i32, ptr addrspace(3) @both
@@ -38,9 +46,10 @@ define void @func_one() {
define amdgpu_kernel void @kern_one() {
; CHECK-LABEL: define {{[^@]+}}@kern_one
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]]
; CHECK-NEXT: call void @func_one()
; CHECK-NEXT: ret void
;
@@ -51,9 +60,13 @@ entry:
define void @func_two() {
; CHECK-LABEL: define {{[^@]+}}@func_two() {
-; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
-; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]]
-; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
+; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
+; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4
+; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
; CHECK-NEXT: ret void
;
%val0 = load i32, ptr addrspace(3) @both
@@ -64,9 +77,10 @@ define void @func_two() {
define amdgpu_kernel void @kern_two() {
; CHECK-LABEL: define {{[^@]+}}@kern_two
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
; CHECK-NEXT: call void @func_two()
; CHECK-NEXT: ret void
;
@@ -82,11 +96,18 @@ entry:
; remains the best candidate for address zero allocation.
define void @func_block_direct_allocation() {
; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() {
-; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]]
-; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
+; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4
+; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
+; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
-; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
+; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
+; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
; CHECK-NEXT: ret void
;
%val1 = load i32, ptr addrspace(3) @one
@@ -99,7 +120,8 @@ define void @func_block_direct_allocation() {
define amdgpu_kernel void @kern_block_direct_allocation() {
; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] {
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]]
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: call void @func_block_direct_allocation()
; CHECK-NEXT: call void @func_one()
@@ -112,35 +134,8 @@ define amdgpu_kernel void @kern_block_direct_allocation() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-;.
-; CHECK: [[META0]] = !{i32 0, i32 1}
-; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
-; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]}
-; CHECK: [[META3]] = distinct !{[[META3]]}
-; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]}
-; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]}
-; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
-; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
-; CHECK: [[META11]] = distinct !{[[META11]]}
-; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]}
-; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]}
-; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]}
-; CHECK: [[META15]] = distinct !{[[META15]]}
-; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]}
-; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]}
-; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]}
-; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]}
-; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]}
-; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]}
-; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]}
-; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]}
-; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]}
-; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]}
-; CHECK: [[META26]] = !{[[META22]]}
-; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]}
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 6509d80..f88b1bf 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %25
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27
; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21
; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21
@@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
index d7b713a..0b4e662 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
@@ -19,7 +19,7 @@ body: |
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -30,7 +30,7 @@ body: |
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -172,7 +172,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -183,7 +183,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -208,7 +208,7 @@ body: |
; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
@@ -219,7 +219,7 @@ body: |
undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub1:areg_128_align2 = COPY %4.sub2
%5.sub2_sub3 = IMPLICIT_DEF
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
index 57f611b..4c2ea2f 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
@@ -17,7 +17,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -26,7 +26,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -47,7 +47,7 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -56,7 +56,7 @@ body: |
%3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
%4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
undef %5.sub0_sub1:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5
GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
@@ -151,7 +151,7 @@ body: |
; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -163,7 +163,7 @@ body: |
%other_use:vreg_64_align2 = COPY %5.sub0_sub1
%6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
undef %8.sub0_sub1:areg_128_align2 = COPY %6
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
@@ -231,7 +231,7 @@ body: |
; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -245,7 +245,7 @@ body: |
%other_use1:vreg_64_align2 = COPY %4.sub2_sub3
%other_use2:vreg_64 = COPY %4.sub1_sub2
%6:areg_128_align2 = COPY %4
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1)
SI_RETURN
...
diff --git a/llvm/test/CodeGen/ARM/and-mask-variable.ll b/llvm/test/CodeGen/ARM/and-mask-variable.ll
new file mode 100644
index 0000000..0f84b76
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/and-mask-variable.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; V7M-LABEL: mask_pair:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: mask_pair:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: mask_pair:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: mask_pair:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: bx lr
+ %shl = shl nsw i32 -1, %y
+ %and = and i32 %shl, %x
+ ret i32 %and
+}
+
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; V7M-LABEL: mask_pair_64:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: and.w r0, r0, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: mask_pair_64:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: mask_pair_64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: mask_pair_64:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shl = shl nsw i64 -1, %y
+ %and = and i64 %shl, %x
+ ret i64 %and
+}
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
new file mode 100644
index 0000000..77deaa5
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -0,0 +1,4591 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+; Patterns:
+; a) (x >> start) & (1 << nbits) - 1
+; b) (x >> start) & ~(-1 << nbits)
+; c) (x >> start) & (-1 >> (32 - nbits))
+; d) (x >> start) << (32 - nbits) >> (32 - nbits)
+; are equivalent (a standalone sketch of the equivalence follows below).
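To make the equivalence above concrete before the generated checks begin, here is a minimal standalone C sketch; it is not part of the test, and the helper names `bextr_a` through `bextr_d` are illustrative only. It restricts `nbits` to 1..31 so every shift amount stays below the bit width and the C expressions remain well defined; under that assumption all four patterns reduce to masking the low `nbits` bits of `x >> start`.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative helpers mirroring patterns a) through d) from the comment above. */
static uint32_t bextr_a(uint32_t x, unsigned start, unsigned nbits) {
  return (x >> start) & ((1u << nbits) - 1u);            /* a) (1 << nbits) - 1 */
}
static uint32_t bextr_b(uint32_t x, unsigned start, unsigned nbits) {
  return (x >> start) & ~(~0u << nbits);                 /* b) ~(-1 << nbits)   */
}
static uint32_t bextr_c(uint32_t x, unsigned start, unsigned nbits) {
  return (x >> start) & (~0u >> (32 - nbits));           /* c) -1 >> (32 - nbits) */
}
static uint32_t bextr_d(uint32_t x, unsigned start, unsigned nbits) {
  return ((x >> start) << (32 - nbits)) >> (32 - nbits); /* d) shift up then back down */
}

int main(void) {
  uint32_t x = 0xDEADBEEFu;
  for (unsigned start = 0; start < 32; ++start)
    for (unsigned nbits = 1; nbits < 32; ++nbits) {      /* keep all shift counts < 32 */
      uint32_t a = bextr_a(x, start, nbits);
      assert(a == bextr_b(x, start, nbits));
      assert(a == bextr_c(x, start, nbits));
      assert(a == bextr_d(x, start, nbits));
    }
  printf("all four bextr patterns agree\n");
  return 0;
}
```

Compiled with any C compiler, the loops exhaust every (start, nbits) pair for one sample value and assert that the four forms agree.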
+
+; ---------------------------------------------------------------------------- ;
+; Pattern a. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a0_arithmetic:
+; V7M: @ %bb.0:
+; V7M-NEXT: asrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a0_arithmetic:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, asr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a0_arithmetic:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: asrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a0_arithmetic:
+; V6M: @ %bb.0:
+; V6M-NEXT: asrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = ashr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: lsrs r3, r1
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: lsrs r3, r1
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: movs r1, #1
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r2, r3, r12, lsl r2
+; V7A-NEXT: and r0, r2, r0, lsr r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsls r1, r2
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: subs r1, r1, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r4, r0
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a0_arithmetic:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: asrpl.w r0, r1, r4
+; V7M-NEXT: asr.w r2, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: asrpl r2, r1, #31
+; V7M-NEXT: and.w r1, r12, r2
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a0_arithmetic:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: asr r2, r1, r2
+; V7A-NEXT: asrpl r0, r1, r3
+; V7A-NEXT: asrpl r2, r1, #31
+; V7A-NEXT: and r0, r4, r0
+; V7A-NEXT: and r1, r12, r2
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a0_arithmetic:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: asrpl.w r0, r1, r4
+; V7A-T-NEXT: asr.w r2, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: asrpl r2, r1, #31
+; V7A-T-NEXT: and.w r1, r12, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a0_arithmetic:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_lasr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = ashr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r4, r3, #32
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: subs.w r12, r3, #32
+; V7M-NEXT: lsl.w r3, lr, r3
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: rsb r12, r3, #32
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: subs r4, r3, #32
+; V7A-NEXT: lsl r3, lr, r3
+; V7A-NEXT: lsr r12, lr, r12
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: lslpl r12, lr, r4
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: subs r3, r3, #1
+; V7A-NEXT: sbc r12, r12, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r3, #32
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: subs.w r12, r3, #32
+; V7A-T-NEXT: lsl.w r3, lr, r3
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: lsr.w r0, r0, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: mov r2, r3
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: rsb.w r1, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r1, lr, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs.w lr, r3, #1
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: sbc r12, r1, #0
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r0, r0, lr
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r6, lr}
+; V7A-NEXT: push {r4, r5, r6, lr}
+; V7A-NEXT: ldr r1, [sp, #16]
+; V7A-NEXT: mov r3, #1
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r5, [r0, #4]
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: lsl r1, r3, r1
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lslpl r0, r3, r4
+; V7A-NEXT: subs r1, r1, #1
+; V7A-NEXT: sbc r3, r0, #0
+; V7A-NEXT: lsr r0, r6, r2
+; V7A-NEXT: rsb r6, r2, #32
+; V7A-NEXT: orr r0, r0, r5, lsl r6
+; V7A-NEXT: subs r6, r2, #32
+; V7A-NEXT: lsrpl r0, r5, r6
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: lsr r1, r5, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r6, pc}
+;
+; V7A-T-LABEL: bextr64_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: movs r3, #1
+; V7A-T-NEXT: ldrd lr, r1, [r0]
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r0, r12, #32
+; V7A-T-NEXT: lsr.w r4, r3, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, r3, r0
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsr.w r3, lr, r2
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r3, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #4
+; V6M-NEXT: sub sp, #4
+; V6M-NEXT: str r2, [sp] @ 4-byte Spill
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #24]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: subs r4, r0, #1
+; V6M-NEXT: sbcs r6, r7
+; V6M-NEXT: ldm r5!, {r0, r1}
+; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: ands r1, r6
+; V6M-NEXT: add sp, #4
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs.w lr, r2, #1
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: sbc r12, r3, #0
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r0, r0, lr
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r6, lr}
+; V7A-NEXT: push {r4, r5, r6, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r3, #1
+; V7A-NEXT: ldr r5, [r0, #4]
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl r0, r3, r4
+; V7A-NEXT: subs r3, r2, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: lsr r2, r5, r1
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r2, r0, r2
+; V7A-NEXT: lsr r0, r6, r1
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: orr r0, r0, r5, lsl r1
+; V7A-NEXT: mov r1, r2
+; V7A-NEXT: lsrpl r0, r5, r4
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r4, r5, r6, pc}
+;
+; V7A-T-LABEL: bextr64_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: lsl.w r2, lr, r2
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: ldrd r12, r0, [r0]
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs.w lr, r2, #1
+; V7A-T-NEXT: sbc r2, r4, #0
+; V7A-T-NEXT: lsr.w r4, r0, r1
+; V7A-T-NEXT: subs.w r3, r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r4, #0
+; V7A-T-NEXT: and.w r2, r2, r4
+; V7A-T-NEXT: rsb.w r4, r1, #32
+; V7A-T-NEXT: lsr.w r1, r12, r1
+; V7A-T-NEXT: lsl.w r4, r0, r4
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r1, r0, r3
+; V7A-T-NEXT: and.w r0, lr, r1
+; V7A-T-NEXT: mov r1, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #4
+; V6M-NEXT: sub sp, #4
+; V6M-NEXT: str r1, [sp] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: subs r4, r0, #1
+; V6M-NEXT: sbcs r5, r7
+; V6M-NEXT: ldm r6!, {r0, r1}
+; V6M-NEXT: ldr r2, [sp] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: add sp, #4
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: mov.w lr, #1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w r4, r12, #32
+; V7M-NEXT: subs.w r3, r12, #32
+; V7M-NEXT: lsr.w r4, lr, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r4, lr, r3
+; V7M-NEXT: lsl.w r3, lr, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: subs r3, #1
+; V7M-NEXT: sbc r12, r4, #0
+; V7M-NEXT: rsb.w r4, r2, #32
+; V7M-NEXT: lsl.w r4, r1, r4
+; V7M-NEXT: orrs r0, r4
+; V7M-NEXT: subs.w r4, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r4
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: and.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: and.w r1, r1, r12
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r3, r12, #32
+; V7A-NEXT: subs r4, r12, #32
+; V7A-NEXT: lsr r3, lr, r3
+; V7A-NEXT: lslpl r3, lr, r4
+; V7A-NEXT: lsl r4, lr, r12
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs r4, r4, #1
+; V7A-NEXT: sbc r12, r3, #0
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r0, r0, r4
+; V7A-NEXT: and r1, r1, r12
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: rsb.w r4, r12, #32
+; V7A-T-NEXT: subs.w r3, r12, #32
+; V7A-T-NEXT: lsr.w r4, lr, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r4, lr, r3
+; V7A-T-NEXT: lsl.w r3, lr, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: subs r3, #1
+; V7A-T-NEXT: sbc r12, r4, #0
+; V7A-T-NEXT: rsb.w r4, r2, #32
+; V7A-T-NEXT: lsl.w r4, r1, r4
+; V7A-T-NEXT: orrs r0, r4
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, r7, lr}
+; V6M-NEXT: push {r4, r5, r6, r7, lr}
+; V6M-NEXT: .pad #12
+; V6M-NEXT: sub sp, #12
+; V6M-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; V6M-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r7, #0
+; V6M-NEXT: ldr r2, [sp, #32]
+; V6M-NEXT: mov r1, r7
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: subs r5, r0, #1
+; V6M-NEXT: sbcs r4, r7
+; V6M-NEXT: mov r0, r6
+; V6M-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; V6M-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: add sp, #12
+; V6M-NEXT: pop {r4, r5, r6, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsls r2, r1
+; V7M-NEXT: subs r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r1, r2, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mov r1, #1
+; V7A-NEXT: lsl r1, r1, r12
+; V7A-NEXT: subs r2, r12, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: sub r1, r1, #1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %shifted
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a1:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: add r12, r3, lr, lsl r12
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r12, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_a1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %truncshifted
+ ret i32 %masked
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
+define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_a2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_a2:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: add r12, r3, lr, lsl r12
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r12, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_a2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_a2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %zextmask = zext i32 %mask to i64
+ %masked = and i64 %zextmask, %shifted
+ %truncmasked = trunc i64 %masked to i32
+ ret i32 %truncmasked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern b. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #0
+; V6M-NEXT: mvns r3, r3
+; V6M-NEXT: lsls r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bics r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #0
+; V6M-NEXT: mvns r3, r3
+; V6M-NEXT: lsls r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bics r0, r3
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: mvns r1, r1
+; V6M-NEXT: lsls r1, r2
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #16]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r5, r0, r3
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: itt pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bic.w r0, r5, r0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsr.w r12, r0, r2
+; V7M-NEXT: rsb.w r0, r2, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r1, r0
+; V7M-NEXT: lsr.w r0, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: subs.w r1, r3, #32
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r2, r1
+; V7M-NEXT: bic.w r1, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bic.w r0, r12, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r12, r0, r2
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: orr r12, r12, r1, lsl r0
+; V7A-NEXT: subs r0, r2, #32
+; V7A-NEXT: lsrpl r12, r1, r0
+; V7A-NEXT: lsr r0, r1, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: subs r1, r3, #32
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: lsl r3, r2, r3
+; V7A-NEXT: lslpl r2, r2, r1
+; V7A-NEXT: bic r1, r0, r2
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: bic r0, r12, r3
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsr.w r12, r0, r2
+; V7A-T-NEXT: rsb.w r0, r2, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r12, r12, r0
+; V7A-T-NEXT: subs.w r0, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r12, r1, r0
+; V7A-T-NEXT: lsr.w r0, r1, r2
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r1, r3, #32
+; V7A-T-NEXT: lsl.w r3, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r2, r1
+; V7A-T-NEXT: bic.w r1, r0, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: bic.w r0, r12, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r4, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r6, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r1, r3, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: lsl.w r2, r3, r12
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bics r0, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r12, r0, [r0]
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w lr, r0, r3
+; V7M-NEXT: lsr.w r3, r12, r1
+; V7M-NEXT: orr.w r12, r3, lr
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r0, r3
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r1
+; V7M-NEXT: bic.w r1, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: bic.w r0, r12, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldm r0, {r0, r3}
+; V7A-NEXT: lsr r12, r0, r1
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: orr r12, r12, r3, lsl r0
+; V7A-NEXT: subs r0, r1, #32
+; V7A-NEXT: lsrpl r12, r3, r0
+; V7A-NEXT: lsr r0, r3, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: subs r1, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r1
+; V7A-NEXT: bic r1, r0, r3
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r12, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: ldrd r12, r3, [r0]
+; V7A-T-NEXT: rsb.w r0, r1, #32
+; V7A-T-NEXT: lsl.w lr, r3, r0
+; V7A-T-NEXT: lsr.w r0, r12, r1
+; V7A-T-NEXT: orr.w r12, r0, lr
+; V7A-T-NEXT: subs.w r0, r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r12, r3, r0
+; V7A-T-NEXT: lsr.w r0, r3, r1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: lsl.w r2, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r1
+; V7A-T-NEXT: bic.w r1, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: bic.w r0, r12, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r4, r2
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r6, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r6, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: subs.w lr, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: lsr r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: subs lr, r12, #32
+; V7A-NEXT: lsl r2, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: lslpl r3, r3, lr
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #16]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r5, r0, r3
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: subs.w lr, r12, #32
+; V7A-T-NEXT: lsl.w r0, r3, r12
+; V7A-T-NEXT: itt pl
+; V7A-T-NEXT: lslpl.w r3, r3, lr
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs.w r4, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, r1, r4
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bic.w r0, r5, r0
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: ldr r2, [sp, #16]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: bics r5, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsls r2, r1
+; V7M-NEXT: subs r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: bics r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: lsl r1, r1, r12
+; V7A-NEXT: subs r2, r12, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: bic r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsr.w r12, r0, r2
+; V7A-T-NEXT: rsb.w r0, r2, #32
+; V7A-T-NEXT: ldrb.w r3, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: subs.w r2, r3, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r2, [r1]
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r4, r0
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %widenumlowbits = zext i8 %numlowbits to i64
+ %notmask = shl nsw i64 -1, %widenumlowbits
+ %mask = xor i64 %notmask, -1
+ %wideres = and i64 %shiftedval, %mask
+ %res = trunc i64 %wideres to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r12
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldrb.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r1, [r1]
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: pop {r7, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %truncshiftedval = trunc i64 %shiftedval to i32
+ %widenumlowbits = zext i8 %numlowbits to i32
+ %notmask = shl nsw i32 -1, %widenumlowbits
+ %mask = xor i32 %notmask, -1
+ %res = and i32 %truncshiftedval, %mask
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
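+; Roughly, in C (illustrative sketch only; numlowbits is assumed below 32):
+;   (uint32_t)((val >> numskipbits) & (uint64_t)~(~0u << numlowbits))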
+define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_b2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldrb.w r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_b2:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldrb r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: bic r0, r0, r1, lsl r12
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_b2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldrb.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: lsl.w r1, r1, r12
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_b2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: add r1, sp, #8
+; V6M-NEXT: ldrb r1, [r1]
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: pop {r7, pc}
+ %shiftedval = lshr i64 %val, %numskipbits
+ %widenumlowbits = zext i8 %numlowbits to i32
+ %notmask = shl nsw i32 -1, %widenumlowbits
+ %mask = xor i32 %notmask, -1
+ %zextmask = zext i32 %mask to i64
+ %wideres = and i64 %shiftedval, %zextmask
+ %res = trunc i64 %wideres to i32
+ ret i32 %res
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern c. 32-bit
+; ---------------------------------------------------------------------------- ;
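+;
+; Pattern c builds the mask by right-shifting -1 by the number of high bits.
+; A rough C sketch of the 32-bit form (illustrative only; 0 < numlowbits <= 32
+; is assumed so the shift amount stays in range):
+;   uint32_t bextr32_c(uint32_t val, uint32_t numskipbits, uint32_t numlowbits) {
+;     return (val >> numskipbits) & (~0u >> (32 - numlowbits));
+;   }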
+
+define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %shifted
+ ret i32 %masked
+}
+
+define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %shifted, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r5, r1, r2
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r4, r12, #64
+; V7A-NEXT: rsbs lr, r12, #32
+; V7A-NEXT: lsr r4, r3, r4
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: and r12, r4, r5
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: mov r1, r12
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #-1
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r2, lr, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: lsr.w r12, r0, r2
+; V7M-NEXT: rsb.w r0, r2, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r1, r0
+; V7M-NEXT: rsb.w r0, r3, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: uxtb r0, r0
+; V7M-NEXT: subs.w lr, r0, #32
+; V7M-NEXT: lsr.w r2, r3, r0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: and.w r0, r3, r12
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: lsr lr, r0, r12
+; V7A-NEXT: rsb r0, r12, #32
+; V7A-NEXT: orr r4, lr, r1, lsl r0
+; V7A-NEXT: mvn lr, #31
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lsrpl r4, r1, r2
+; V7A-NEXT: rsb r2, r3, #64
+; V7A-NEXT: lsr r1, r1, r12
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsr r0, r3, r12
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: and r1, r0, r1
+; V7A-NEXT: lsrpl r3, r3, r2
+; V7A-NEXT: and r0, r3, r4
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: lsr.w lr, r0, r12
+; V7A-T-NEXT: rsb.w r0, r12, #32
+; V7A-T-NEXT: lsl.w r0, r1, r0
+; V7A-T-NEXT: orr.w r4, lr, r0
+; V7A-T-NEXT: mvn lr, #31
+; V7A-T-NEXT: uxtab r2, lr, r2
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r4, r1, r2
+; V7A-T-NEXT: rsb.w r2, r3, #64
+; V7A-T-NEXT: lsr.w r1, r1, r12
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: uxtab r2, lr, r2
+; V7A-T-NEXT: lsr.w r0, r3, r12
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: and.w r1, r1, r0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: and.w r0, r3, r4
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r3
+; V6M-NEXT: uxtb r2, r2
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r5
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r6
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: lsr.w r1, r3, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r8, lr}
+; V7A-NEXT: push {r4, r6, r8, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r6, r12, #64
+; V7A-NEXT: ldr r8, [r0]
+; V7A-NEXT: mvn r0, #0
+; V7A-NEXT: rsbs r1, r12, #32
+; V7A-NEXT: lsr r6, r0, r6
+; V7A-NEXT: lsr r4, r3, r2
+; V7A-NEXT: lsrpl r0, r0, r1
+; V7A-NEXT: movwpl r6, #0
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: and r1, r6, r4
+; V7A-NEXT: lsr r6, r8, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: orr r2, r6, r3, lsl r2
+; V7A-NEXT: lsrpl r2, r3, r12
+; V7A-NEXT: and r0, r0, r2
+; V7A-NEXT: pop {r4, r6, r8, pc}
+;
+; V7A-T-LABEL: bextr64_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r1, r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r12, r12, #32
+; V7A-T-NEXT: lsr.w r2, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, r12
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsr.w r12, r0, r1
+; V7M-NEXT: rsb.w r0, r1, #32
+; V7M-NEXT: lsl.w r0, r3, r0
+; V7M-NEXT: orr.w r12, r12, r0
+; V7M-NEXT: subs.w r0, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r12, r3, r0
+; V7M-NEXT: rsb.w r0, r2, #64
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: uxtb r0, r0
+; V7M-NEXT: subs.w lr, r0, #32
+; V7M-NEXT: lsr.w r2, r3, r0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: and.w r0, r3, r12
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bextr64_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: lsr r12, r4, r0
+; V7A-NEXT: rsb r4, r0, #32
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: orr lr, r12, r3, lsl r4
+; V7A-NEXT: mvn r12, #31
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsrpl lr, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: uxtab r4, r12, r1
+; V7A-NEXT: lsr r2, r3, r2
+; V7A-NEXT: cmp r4, #0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r1, r2, r0
+; V7A-NEXT: lsrpl r3, r3, r4
+; V7A-NEXT: and r0, r3, lr
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bextr64_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r7, lr}
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: rsb.w r3, r0, #32
+; V7A-T-NEXT: lsl.w r4, lr, r3
+; V7A-T-NEXT: lsr.w r3, r12, r0
+; V7A-T-NEXT: orr.w r5, r3, r4
+; V7A-T-NEXT: mvn r12, #31
+; V7A-T-NEXT: uxtab r1, r12, r1
+; V7A-T-NEXT: lsr.w r0, lr, r0
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r5, lr, r1
+; V7A-T-NEXT: rsb.w r1, r2, #64
+; V7A-T-NEXT: mov.w r4, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: uxtb r2, r1
+; V7A-T-NEXT: uxtab r3, r12, r1
+; V7A-T-NEXT: lsr.w r2, r4, r2
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: and.w r1, r2, r0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r4, r3
+; V7A-T-NEXT: and.w r0, r4, r5
+; V7A-T-NEXT: pop {r4, r5, r7, pc}
+;
+; V6M-LABEL: bextr64_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r2
+; V6M-NEXT: ldr r4, [r0]
+; V6M-NEXT: ldr r3, [r0, #4]
+; V6M-NEXT: uxtb r2, r1
+; V6M-NEXT: mov r0, r4
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r6, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r5
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r6
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %shifted
+ ret i64 %masked
+}
+
+define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: ldr.w r12, [sp]
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsr.w r3, r2, r3
+; V7M-NEXT: rsbs.w r12, r12, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r2, r2, r12
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r12, [sp, #16]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r5, r1, r2
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r4, r12, #64
+; V7A-NEXT: rsbs lr, r12, #32
+; V7A-NEXT: lsr r4, r3, r4
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: and r12, r5, r4
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: mov r1, r12
+; V7A-NEXT: and r0, r0, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: mov.w lr, #-1
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: rsb.w r2, r12, #64
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r2, lr, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bextr64_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: ldr r0, [sp, #16]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %shifted, %mask ; swapped order
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: rsbs.w r1, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r2, r1
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r3, [sp]
+; V7A-NEXT: rsbs r12, r3, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsrpl r3, r3, r12
+; V7A-NEXT: lsr r12, r0, r2
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r12, r1, lsl r0
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: rsbs.w r1, r12, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r2, r1
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: ldr r0, [sp, #8]
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r2, r1, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %shifted
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %truncshifted
+ ret i32 %masked
+}
+
+; Shifting happens in 64-bit. Mask is 32-bit, but extended to 64-bit.
+; Masking is 64-bit. Then truncation.
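+; Roughly, in C (illustrative sketch only; 0 < numlowbits <= 32 assumed):
+;   (uint32_t)((val >> numskipbits) & (uint64_t)(~0u >> (32 - numlowbits)))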
+define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_c2:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_c2:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_c2:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_c2:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %zextmask = zext i32 %mask to i64
+ %masked = and i64 %zextmask, %shifted
+ %truncmasked = trunc i64 %masked to i32
+ ret i32 %truncmasked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern d. 32-bit
+; ---------------------------------------------------------------------------- ;
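+;
+; Pattern d avoids an explicit mask: the high bits are shifted out and back in.
+; A rough C sketch of the 32-bit form (illustrative only; numlowbits > 0 is
+; assumed so the shift amount stays below 32):
+;   uint32_t bextr32_d(uint32_t val, uint32_t numskipbits, uint32_t numlowbits) {
+;     uint32_t numhighbits = 32 - numlowbits;
+;     return ((val >> numskipbits) << numhighbits) >> numhighbits;
+;   }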
+
+define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %shifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %shifted, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r3, #32
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: lsls r0, r2
+; V6M-NEXT: lsrs r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %shifted = lshr i32 %val, %numskipbits
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %shifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr32_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr32_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: rsb r1, r2, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr32_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr32_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: movs r1, #32
+; V6M-NEXT: subs r1, r1, r2
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %skip = zext i8 %numskipbits to i32
+ %shifted = lshr i32 %val, %skip
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %shifted, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: rsb.w r12, r3, #32
+; V7M-NEXT: lsls r1, r3
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r1, r1, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r2, r1, r12
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, lr
+; V7M-NEXT: lsr.w r1, r1, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsr r1, r2, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: rsb.w r3, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: uxtb.w lr, r2
+; V7M-NEXT: subs.w r2, lr, #32
+; V7M-NEXT: lsr.w r12, r0, lr
+; V7M-NEXT: rsb.w r0, lr, #32
+; V7M-NEXT: lsl.w r0, r1, r0
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: rsb.w r2, r3, #64
+; V7M-NEXT: lsr.w r1, r1, lr
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: sub.w r3, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orrs r1, r4
+; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: uxtb r12, r2
+; V7A-NEXT: lsr lr, r0, r12
+; V7A-NEXT: rsb r0, r12, #32
+; V7A-NEXT: orr r0, lr, r1, lsl r0
+; V7A-NEXT: mvn lr, #31
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r2, r3, #64
+; V7A-NEXT: lsr r1, r1, r12
+; V7A-NEXT: uxtb r3, r2
+; V7A-NEXT: rsb r4, r3, #32
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: uxtab r2, lr, r2
+; V7A-NEXT: lsr r5, r0, r4
+; V7A-NEXT: orr r1, r5, r1, lsl r3
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r6, r7, lr}
+; V7A-T-NEXT: push {r4, r5, r6, r7, lr}
+; V7A-T-NEXT: uxtb.w r12, r2
+; V7A-T-NEXT: rsb.w r6, r12, #32
+; V7A-T-NEXT: rsb.w r3, r3, #64
+; V7A-T-NEXT: lsr.w r0, r0, r12
+; V7A-T-NEXT: mvn r7, #31
+; V7A-T-NEXT: uxtab r2, r7, r2
+; V7A-T-NEXT: lsl.w r6, r1, r6
+; V7A-T-NEXT: lsr.w lr, r1, r12
+; V7A-T-NEXT: orrs r0, r6
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w lr, #0
+; V7A-T-NEXT: uxtb r5, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r5, #32
+; V7A-T-NEXT: uxtab r3, r7, r3
+; V7A-T-NEXT: lsl.w r4, lr, r5
+; V7A-T-NEXT: lsr.w r2, r0, r1
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: orr.w r2, r2, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r3
+; V7A-T-NEXT: lsl.w r0, r0, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: lsr.w r0, r0, r5
+; V7A-T-NEXT: orr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r2, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; V6M-LABEL: bextr64_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r3
+; V6M-NEXT: uxtb r2, r2
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r2, r2, r4
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %shifted, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: orrs r0, r1
+; V7M-NEXT: subs.w r1, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r1
+; V7M-NEXT: rsb.w r1, r12, #64
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: rsb.w r12, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsl.w r2, r2, r1
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r2, r2, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r2, r12
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: lsr r1, r2, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: ldrd r0, r3, [r0]
+; V7A-T-NEXT: rsb.w r1, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: orrs r0, r1
+; V7A-T-NEXT: subs.w r1, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r1
+; V7A-T-NEXT: lsr.w r2, r3, r2
+; V7A-T-NEXT: rsb.w r1, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: rsb.w lr, r1, #32
+; V7A-T-NEXT: rsbs.w r3, r12, #32
+; V7A-T-NEXT: lsl.w r2, r2, r1
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r2, r2, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r3
+; V7A-T-NEXT: lsl.w r0, r0, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r2, lr
+; V7A-T-NEXT: lsr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r2, r1
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: ldr r3, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: ldrd r0, lr, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: subs.w r3, r1, #32
+; V7M-NEXT: lsr.w r12, r0, r1
+; V7M-NEXT: rsb.w r0, r1, #32
+; V7M-NEXT: lsr.w r1, lr, r1
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: lsl.w r0, lr, r0
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, lr, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: lsls r1, r2
+; V7M-NEXT: sub.w r3, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orrs r1, r4
+; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: lsr r12, r4, r0
+; V7A-NEXT: rsb r4, r0, #32
+; V7A-NEXT: lsr r0, r3, r0
+; V7A-NEXT: orr r4, r12, r3, lsl r4
+; V7A-NEXT: mvn r12, #31
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsrpl r4, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: rsb lr, r2, #32
+; V7A-NEXT: uxtab r1, r12, r1
+; V7A-NEXT: lsr r5, r4, lr
+; V7A-NEXT: orr r3, r5, r0, lsl r2
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lsl r0, r4, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lslpl r3, r4, r1
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r3, lsl lr
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r4, r5, r11, pc}
+;
+; V7A-T-LABEL: bextr64_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, r5, r6, lr}
+; V7A-T-NEXT: push {r4, r5, r6, lr}
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: rsb.w r6, r0, #32
+; V7A-T-NEXT: lsr.w r3, lr, r0
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mvn r4, #31
+; V7A-T-NEXT: lsr.w r0, r12, r0
+; V7A-T-NEXT: uxtab r1, r4, r1
+; V7A-T-NEXT: lsl.w r6, lr, r6
+; V7A-T-NEXT: orrs r0, r6
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: uxtb r5, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, lr, r1
+; V7A-T-NEXT: rsb.w r1, r5, #32
+; V7A-T-NEXT: lsls r3, r5
+; V7A-T-NEXT: uxtab r2, r4, r2
+; V7A-T-NEXT: lsr.w r6, r0, r1
+; V7A-T-NEXT: orrs r3, r6
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r5
+; V7A-T-NEXT: lsl.w r1, r3, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsr.w r0, r0, r5
+; V7A-T-NEXT: orr.w r0, r0, r1
+; V7A-T-NEXT: lsr.w r1, r3, r5
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r3, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, r5, r6, pc}
+;
+; V6M-LABEL: bextr64_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r2
+; V6M-NEXT: ldr r5, [r0]
+; V6M-NEXT: ldr r3, [r0, #4]
+; V6M-NEXT: uxtb r2, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r2, r2, r4
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %skip = zext i8 %numskipbits to i64
+ %shifted = lshr i64 %val, %skip
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %shifted, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+; 64-bit, but with 32-bit output
+
+; Everything done in 64-bit, truncation happens last.
+define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r4, lr}
+; V7M-NEXT: push {r4, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: ldr.w r12, [sp, #8]
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orrs r0, r3
+; V7M-NEXT: subs.w r3, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: rsb.w r3, r12, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w lr, r12, #32
+; V7M-NEXT: rsb.w r12, r3, #32
+; V7M-NEXT: lsls r1, r3
+; V7M-NEXT: cmp.w lr, #0
+; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: orr.w r1, r1, r4
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, lr
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r2, r1, r12
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, lr
+; V7M-NEXT: pop {r4, pc}
+;
+; V7A-LABEL: bextr64_32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: lsr r3, r1, r2
+; V7A-NEXT: subs lr, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: rsb r2, r2, #32
+; V7A-NEXT: ldr r12, [sp, #8]
+; V7A-NEXT: movwpl r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r2
+; V7A-NEXT: lsrpl r0, r1, lr
+; V7A-NEXT: rsb r1, r12, #64
+; V7A-NEXT: rsb lr, r1, #32
+; V7A-NEXT: lsr r2, r0, lr
+; V7A-NEXT: orr r2, r2, r3, lsl r1
+; V7A-NEXT: rsbs r3, r12, #32
+; V7A-NEXT: lslpl r2, r0, r3
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: orr r0, r0, r2, lsl lr
+; V7A-NEXT: lsrpl r0, r2, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bextr64_32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: ldr.w r12, [sp, #8]
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orrs r0, r3
+; V7A-T-NEXT: subs.w r3, r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r3
+; V7A-T-NEXT: lsr.w r1, r1, r2
+; V7A-T-NEXT: rsb.w r3, r12, #64
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsls r1, r3
+; V7A-T-NEXT: rsbs.w r2, r12, #32
+; V7A-T-NEXT: lsr.w r4, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bextr64_32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r2, [sp, #8]
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %shifted, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ %res = trunc i64 %masked to i32
+ ret i32 %res
+}
+
+; Shifting happens in 64-bit, then truncation. Masking is 32-bit.
+define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
+; V7M-LABEL: bextr64_32_d1:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsrs r0, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: ldr r1, [sp]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bextr64_32_d1:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: ldr r12, [sp]
+; V7A-NEXT: subs r2, r2, #32
+; V7A-NEXT: orr r0, r0, r1, lsl r3
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: rsb r1, r12, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bextr64_32_d1:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: lsrs r0, r2
+; V7A-T-NEXT: ldr.w r12, [sp]
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: lsl.w r3, r1, r3
+; V7A-T-NEXT: orr.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: rsb.w r1, r12, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bextr64_32_d1:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r7, lr}
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldr r1, [sp, #8]
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: pop {r7, pc}
+ %shifted = lshr i64 %val, %numskipbits
+ %truncshifted = trunc i64 %shifted to i32
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %truncshifted, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant
+; ---------------------------------------------------------------------------- ;
+
+; https://bugs.llvm.org/show_bug.cgi?id=38938
+define void @pr38938(ptr %a0, ptr %a1) nounwind {
+; V7M-LABEL: pr38938:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r1, [r1]
+; V7M-NEXT: ubfx r1, r1, #21, #10
+; V7M-NEXT: ldr.w r2, [r0, r1, lsl #2]
+; V7M-NEXT: adds r2, #1
+; V7M-NEXT: str.w r2, [r0, r1, lsl #2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: pr38938:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r1, [r1]
+; V7A-NEXT: ubfx r1, r1, #21, #10
+; V7A-NEXT: ldr r2, [r0, r1, lsl #2]
+; V7A-NEXT: add r2, r2, #1
+; V7A-NEXT: str r2, [r0, r1, lsl #2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: pr38938:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r1, [r1]
+; V7A-T-NEXT: ubfx r1, r1, #21, #10
+; V7A-T-NEXT: ldr.w r2, [r0, r1, lsl #2]
+; V7A-T-NEXT: adds r2, #1
+; V7A-T-NEXT: str.w r2, [r0, r1, lsl #2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: pr38938:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r1]
+; V6M-NEXT: lsrs r1, r1, #19
+; V6M-NEXT: ldr r2, .LCPI51_0
+; V6M-NEXT: ands r2, r1
+; V6M-NEXT: ldr r1, [r0, r2]
+; V6M-NEXT: adds r1, r1, #1
+; V6M-NEXT: str r1, [r0, r2]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI51_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp = load i64, ptr %a1, align 8
+ %tmp1 = lshr i64 %tmp, 21
+ %tmp2 = and i64 %tmp1, 1023
+ %tmp3 = getelementptr inbounds i32, ptr %a0, i64 %tmp2
+ %tmp4 = load i32, ptr %tmp3, align 4
+ %tmp5 = add nsw i32 %tmp4, 1
+ store i32 %tmp5, ptr %tmp3, align 4
+ ret void
+}
+
+; The most canonical variant
+define i32 @c0_i32(i32 %arg) nounwind {
+; V7M-LABEL: c0_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #10
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c0_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #10
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c0_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #10
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c0_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ ret i32 %tmp1
+}
+
+; Should still be fine, but the mask is shifted
+define i32 @c1_i32(i32 %arg) nounwind {
+; V7M-LABEL: c1_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r1, #4092
+; V7M-NEXT: and.w r0, r1, r0, lsr #19
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c1_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r1, #4092
+; V7A-NEXT: and r0, r1, r0, lsr #19
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c1_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r1, #4092
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #19
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c1_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r0, #19
+; V6M-NEXT: ldr r0, .LCPI53_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI53_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 4092
+ ret i32 %tmp1
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define i32 @c2_i32(i32 %arg) nounwind {
+; V7M-LABEL: c2_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r1, #4092
+; V7M-NEXT: and.w r0, r1, r0, lsr #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c2_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r1, #4092
+; V7A-NEXT: and r0, r1, r0, lsr #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c2_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r1, #4092
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c2_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r0, #17
+; V6M-NEXT: ldr r0, .LCPI54_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI54_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ %tmp2 = shl i32 %tmp1, 2
+ ret i32 %tmp2
+}
+
+; The mask covers a newly shifted-in bit
+define i32 @c4_i32_bad(i32 %arg) nounwind {
+; V7M-LABEL: c4_i32_bad:
+; V7M: @ %bb.0:
+; V7M-NEXT: mvn r1, #1
+; V7M-NEXT: and.w r0, r1, r0, lsr #19
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c4_i32_bad:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r1, #1
+; V7A-NEXT: and r0, r1, r0, lsr #19
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c4_i32_bad:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mvn r1, #1
+; V7A-T-NEXT: and.w r0, r1, r0, lsr #19
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c4_i32_bad:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r0, #20
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 16382
+ ret i32 %tmp1
+}
+
+; i64
+
+; The most canonical variant
+define i64 @c0_i64(i64 %arg) nounwind {
+; V7M-LABEL: c0_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r1, #19, #10
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c0_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r1, #19, #10
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c0_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r1, #19, #10
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c0_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r1, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ ret i64 %tmp1
+}
+
+; Should still be fine, but the mask is shifted
+define i64 @c1_i64(i64 %arg) nounwind {
+; V7M-LABEL: c1_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r0, #4092
+; V7M-NEXT: and.w r0, r0, r1, lsr #19
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c1_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: and r0, r0, r1, lsr #19
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c1_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r0, #4092
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #19
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c1_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r1, #19
+; V6M-NEXT: ldr r0, .LCPI57_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI57_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 4092
+ ret i64 %tmp1
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define i64 @c2_i64(i64 %arg) nounwind {
+; V7M-LABEL: c2_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r0, #4092
+; V7M-NEXT: and.w r0, r0, r1, lsr #17
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c2_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: and r0, r0, r1, lsr #17
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c2_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r0, #4092
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #17
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c2_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r1, r1, #17
+; V6M-NEXT: ldr r0, .LCPI58_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI58_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ %tmp2 = shl i64 %tmp1, 2
+ ret i64 %tmp2
+}
+
+; The mask covers a newly shifted-in bit
+define i64 @c4_i64_bad(i64 %arg) nounwind {
+; V7M-LABEL: c4_i64_bad:
+; V7M: @ %bb.0:
+; V7M-NEXT: mvn r0, #1
+; V7M-NEXT: and.w r0, r0, r1, lsr #19
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c4_i64_bad:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r0, #1
+; V7A-NEXT: and r0, r0, r1, lsr #19
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c4_i64_bad:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mvn r0, #1
+; V7A-T-NEXT: and.w r0, r0, r1, lsr #19
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c4_i64_bad:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r1, #20
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 16382
+ ret i64 %tmp1
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c5_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #10
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c5_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #10
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c5_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #10
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c5_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #3
+; V6M-NEXT: lsrs r0, r0, #22
+; V6M-NEXT: str r0, [r1]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ store i32 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the mask is shifted
+define void @c6_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c6_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: ubfx r0, r0, #19, #12
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c6_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: ubfx r0, r0, #19, #12
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c6_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ubfx r0, r0, #19, #12
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c6_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsls r0, r0, #1
+; V6M-NEXT: lsrs r0, r0, #20
+; V6M-NEXT: str r0, [r1]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 4095
+ store i32 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c7_i32:
+; V7M: @ %bb.0:
+; V7M-NEXT: movw r2, #4092
+; V7M-NEXT: and.w r0, r2, r0, lsr #17
+; V7M-NEXT: str r0, [r1]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c7_i32:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r2, #4092
+; V7A-NEXT: and r0, r2, r0, lsr #17
+; V7A-NEXT: str r0, [r1]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c7_i32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movw r2, #4092
+; V7A-T-NEXT: and.w r0, r2, r0, lsr #17
+; V7A-T-NEXT: str r0, [r1]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c7_i32:
+; V6M: @ %bb.0:
+; V6M-NEXT: lsrs r0, r0, #17
+; V6M-NEXT: ldr r2, .LCPI62_0
+; V6M-NEXT: ands r2, r0
+; V6M-NEXT: str r2, [r1]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI62_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i32 %arg, 19
+ %tmp1 = and i32 %tmp0, 1023
+ %tmp2 = shl i32 %tmp1, 2
+ store i32 %tmp2, ptr %ptr
+ ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c5_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: ubfx r1, r1, #19, #10
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c5_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r0, #0
+; V7A-NEXT: str r0, [r2, #4]
+; V7A-NEXT: ubfx r0, r1, #19, #10
+; V7A-NEXT: str r0, [r2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c5_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: ubfx r1, r1, #19, #10
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c5_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsls r1, r1, #3
+; V6M-NEXT: lsrs r1, r1, #22
+; V6M-NEXT: str r1, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ store i64 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the mask is shifted
+define void @c6_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c6_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: ubfx r1, r1, #19, #12
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c6_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r0, #0
+; V7A-NEXT: str r0, [r2, #4]
+; V7A-NEXT: ubfx r0, r1, #19, #12
+; V7A-NEXT: str r0, [r2]
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c6_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: ubfx r1, r1, #19, #12
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c6_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsls r1, r1, #1
+; V6M-NEXT: lsrs r1, r1, #20
+; V6M-NEXT: str r1, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 4095
+ store i64 %tmp1, ptr %ptr
+ ret void
+}
+
+; Should still be fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
+; V7M-LABEL: c7_i64:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r0, #0
+; V7M-NEXT: movw r3, #4092
+; V7M-NEXT: and.w r1, r3, r1, lsr #17
+; V7M-NEXT: strd r1, r0, [r2]
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: c7_i64:
+; V7A: @ %bb.0:
+; V7A-NEXT: movw r0, #4092
+; V7A-NEXT: mov r3, #0
+; V7A-NEXT: and r0, r0, r1, lsr #17
+; V7A-NEXT: stm r2, {r0, r3}
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: c7_i64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r0, #0
+; V7A-T-NEXT: movw r3, #4092
+; V7A-T-NEXT: and.w r1, r3, r1, lsr #17
+; V7A-T-NEXT: strd r1, r0, [r2]
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: c7_i64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: lsrs r1, r1, #17
+; V6M-NEXT: ldr r3, .LCPI65_0
+; V6M-NEXT: ands r3, r1
+; V6M-NEXT: str r3, [r2]
+; V6M-NEXT: str r0, [r2, #4]
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI65_0:
+; V6M-NEXT: .long 4092 @ 0xffc
+ %tmp0 = lshr i64 %arg, 51
+ %tmp1 = and i64 %tmp0, 1023
+ %tmp2 = shl i64 %tmp1, 2
+ store i64 %tmp2, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
new file mode 100644
index 0000000..b483793
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -0,0 +1,2752 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
+; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
+; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
+; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
+
+; Patterns:
+; a) x & (1 << nbits) - 1
+; b) x & ~(-1 << nbits)
+; c) x & (-1 >> (32 - nbits))
+; d) x << (32 - nbits) >> (32 - nbits)
+; are equivalent.
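+;
+; For example (with illustrative values x = 0xff, nbits = 4), each pattern
+; extracts the low 4 bits and yields 0xf; all shifts here are logical:
+; a) 0xff & ((1 << 4) - 1)          = 0xff & 0xf = 0xf
+; b) 0xff & ~(-1 << 4)              = 0xff & 0xf = 0xf
+; c) 0xff & (-1 >> (32 - 4))        = 0xff & 0xf = 0xf
+; d) (0xff << (32 - 4)) >> (32 - 4) = 0xf0000000 >> 28 = 0xf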
+
+; ---------------------------------------------------------------------------- ;
+; Pattern a. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r1, r0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %conv = zext i8 %numlowbits to i32
+ %onebit = shl i32 1, %conv
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: movs r2, #1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: subs r1, #1
+; V7M-NEXT: ands r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: add r1, r3, r2, lsl r1
+; V7A-NEXT: and r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: subs r1, #1
+; V7A-T-NEXT: ands r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #1
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: subs r1, r2, #1
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %onebit = shl i32 1, %numlowbits
+ %mask = add nsw i32 %onebit, -1
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+; Check that we don't throw away the vreg_width-1 mask if not using shifts
+define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a0_masked:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: and r2, r2, #63
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a0_masked:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: and r2, r2, #63
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a0_masked:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: and r2, r2, #63
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a0_masked:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #63
+; V6M-NEXT: ands r2, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %numlowbits.masked = and i64 %numlowbits, 63
+ %onebit = shl i64 1, %numlowbits.masked
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r2, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r2, #32
+; V7M-NEXT: movs r3, #1
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsl.w r2, r3, r2
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: sbc r1, r1, #0
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_a2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r1, #1
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r0, r2, #32
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsr r0, r1, r0
+; V7A-NEXT: lslpl r0, r1, r4
+; V7A-NEXT: lsl r1, r1, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: subs r2, r1, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: movs r1, #1
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: subs.w r0, r2, #32
+; V7A-T-NEXT: lsr.w r3, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r1, r0
+; V7A-T-NEXT: lsl.w r0, r1, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r1, r3, #0
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r5, #0
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r2, r0, #1
+; V6M-NEXT: sbcs r1, r5
+; V6M-NEXT: ldm r4!, {r0, r3}
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r1, #32
+; V7M-NEXT: movs r3, #1
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: lsl.w r1, r3, r1
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: subs r3, r1, #1
+; V7M-NEXT: sbc r1, r2, #0
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_a3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: mov r2, #1
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: rsb r0, r1, #32
+; V7A-NEXT: subs r4, r1, #32
+; V7A-NEXT: lsl r1, r2, r1
+; V7A-NEXT: lsr r0, r2, r0
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: lslpl r0, r2, r4
+; V7A-NEXT: subs r2, r1, #1
+; V7A-NEXT: sbc r0, r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r1, #32
+; V7A-T-NEXT: movs r2, #1
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: subs.w r0, r1, #32
+; V7A-T-NEXT: lsr.w r3, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r2, r0
+; V7A-T-NEXT: lsl.w r0, r2, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: subs r0, #1
+; V7A-T-NEXT: sbc r1, r3, #0
+; V7A-T-NEXT: and.w r0, r0, r12
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r5, #0
+; V6M-NEXT: mov r1, r5
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r2, r0, #1
+; V6M-NEXT: sbcs r1, r5
+; V6M-NEXT: ldm r4!, {r0, r3}
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %val = load i64, ptr %w
+ %conv = zext i8 %numlowbits to i64
+ %onebit = shl i64 1, %conv
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_a4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: mov.w r12, #1
+; V7M-NEXT: subs.w lr, r2, #32
+; V7M-NEXT: lsl.w r2, r12, r2
+; V7M-NEXT: lsr.w r3, r12, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r12, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: subs r2, #1
+; V7M-NEXT: sbc r3, r3, #0
+; V7M-NEXT: ands r0, r2
+; V7M-NEXT: ands r1, r3
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_a4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: mov r12, #1
+; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: subs r2, r2, #1
+; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: and r0, r0, r2
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_a4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #32
+; V7A-T-NEXT: mov.w r12, #1
+; V7A-T-NEXT: subs.w lr, r2, #32
+; V7A-T-NEXT: lsl.w r2, r12, r2
+; V7A-T-NEXT: lsr.w r3, r12, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r3, r12, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: subs r2, #1
+; V7A-T-NEXT: sbc r3, r3, #0
+; V7A-T-NEXT: ands r0, r2
+; V7A-T-NEXT: ands r1, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_a4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r6, lr}
+; V6M-NEXT: push {r4, r5, r6, lr}
+; V6M-NEXT: mov r5, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #1
+; V6M-NEXT: movs r6, #0
+; V6M-NEXT: mov r1, r6
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: subs r0, r0, #1
+; V6M-NEXT: sbcs r1, r6
+; V6M-NEXT: ands r1, r5
+; V6M-NEXT: ands r0, r4
+; V6M-NEXT: pop {r4, r5, r6, pc}
+ %onebit = shl i64 1, %numlowbits
+ %mask = add nsw i64 %onebit, -1
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern b. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %conv = zext i8 %numlowbits to i32
+ %notmask = shl i32 -1, %conv
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: lsl.w r1, r2, r1
+; V7M-NEXT: bics r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: bic r0, r0, r2, lsl r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: lsl.w r1, r2, r1
+; V7A-T-NEXT: bics r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #0
+; V6M-NEXT: mvns r2, r2
+; V6M-NEXT: lsls r2, r1
+; V6M-NEXT: bics r0, r2
+; V6M-NEXT: bx lr
+ %notmask = shl i32 -1, %numlowbits
+ %mask = xor i32 %notmask, -1
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b0:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b0:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r1, #-1
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsl.w r3, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r1, r12
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: bic.w r1, r2, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, lr}
+; V7A-NEXT: push {r4, lr}
+; V7A-NEXT: ldr r4, [r0]
+; V7A-NEXT: mvn r1, #0
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: subs r0, r2, #32
+; V7A-NEXT: lsl r2, r1, r2
+; V7A-NEXT: lslpl r1, r1, r0
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r3, r1
+; V7A-NEXT: bic r0, r4, r2
+; V7A-NEXT: pop {r4, pc}
+;
+; V7A-T-LABEL: bzhi64_b2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r1, #-1
+; V7A-T-NEXT: ldrd r0, r12, [r0]
+; V7A-T-NEXT: lsl.w r3, r1, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r1, r2
+; V7A-T-NEXT: bics r0, r3
+; V7A-T-NEXT: bic.w r1, r12, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: bics r2, r0
+; V6M-NEXT: bics r3, r1
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r2, #-1
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: lsl.w r3, r2, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r3, #0
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r2, r12
+; V7M-NEXT: bics r1, r2
+; V7M-NEXT: bics r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: subs r0, r1, #32
+; V7A-NEXT: lsl r4, r2, r1
+; V7A-NEXT: lslpl r2, r2, r0
+; V7A-NEXT: movwpl r4, #0
+; V7A-NEXT: bic r1, r3, r2
+; V7A-NEXT: bic r0, r6, r4
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_b3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: ldrd r0, r12, [r0]
+; V7A-T-NEXT: lsl.w r3, r2, r1
+; V7A-T-NEXT: subs r1, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r2, r1
+; V7A-T-NEXT: bics r0, r3
+; V7A-T-NEXT: bic.w r1, r12, r2
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r2, r1
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: bics r2, r0
+; V6M-NEXT: bics r3, r1
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %conv = zext i8 %numlowbits to i64
+ %notmask = shl i64 -1, %conv
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_b4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsl.w r12, r3, r2
+; V7M-NEXT: subs r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl.w r12, #0
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl r3, r2
+; V7M-NEXT: bic.w r0, r0, r12
+; V7M-NEXT: bics r1, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_b4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: subs r12, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsl r2, r3, r2
+; V7A-NEXT: lslpl r3, r3, r12
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: bic r1, r1, r3
+; V7A-NEXT: bic r0, r0, r2
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_b4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsl.w r12, r3, r2
+; V7A-T-NEXT: subs r2, #32
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl r3, r2
+; V7A-T-NEXT: bic.w r0, r0, r12
+; V7A-T-NEXT: bics r1, r3
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_b4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: bics r5, r0
+; V6M-NEXT: bics r4, r1
+; V6M-NEXT: mov r0, r5
+; V6M-NEXT: mov r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %notmask = shl i64 -1, %numlowbits
+ %mask = xor i64 %notmask, -1
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern c. 32-bit
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %mask = lshr i32 -1, %sh_prom
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
+define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %val, %mask ; swapped order
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsbs.w lr, r2, #32
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_c0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsbs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #64
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r2, r12, r2
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: and r1, r2, r1
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w lr, r2, #32
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsr.w r2, r12, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: lsr.w r2, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: mvn r2, #31
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: uxtb r12, lr
+; V7A-NEXT: uxtab r2, r2, lr
+; V7A-NEXT: lsr r12, r3, r12
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: movwpl r12, #0
+; V7A-NEXT: lsrpl r3, r3, r2
+; V7A-NEXT: and r1, r12, r1
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w lr, r2, #64
+; V7A-T-NEXT: mvn r2, #31
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: uxtb.w r12, lr
+; V7A-T-NEXT: uxtab r2, r2, lr
+; V7A-T-NEXT: lsr.w r12, r3, r12
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl.w r12, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r2
+; V7A-T-NEXT: and.w r1, r1, r12
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r2
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsbs.w r1, r2, #32
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r3, r1
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
+; V7A-NEXT: rsbs r1, r2, #32
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: lsrpl r3, r3, r1
+; V7A-NEXT: rsb r1, r2, #64
+; V7A-NEXT: and r0, r3, r0
+; V7A-NEXT: lsr r1, r12, r1
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: and r1, r1, r5
+; V7A-NEXT: pop {r5, pc}
+;
+; V7A-T-LABEL: bzhi64_c2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w r1, r2, #32
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: ldrd r0, lr, [r0]
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r3, r1
+; V7A-T-NEXT: rsb.w r1, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: and.w r0, r0, r3
+; V7A-T-NEXT: lsr.w r1, r12, r1
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: and.w r1, r1, lr
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #64
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: subs.w r2, r1, #32
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl r3, r2
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_c3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r4, r6, r11, lr}
+; V7A-NEXT: push {r4, r6, r11, lr}
+; V7A-NEXT: rsb r1, r1, #64
+; V7A-NEXT: mvn r4, #31
+; V7A-NEXT: mvn r2, #0
+; V7A-NEXT: ldr r6, [r0]
+; V7A-NEXT: ldr r3, [r0, #4]
+; V7A-NEXT: uxtb r0, r1
+; V7A-NEXT: uxtab r4, r4, r1
+; V7A-NEXT: lsr r0, r2, r0
+; V7A-NEXT: cmp r4, #0
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: and r1, r0, r3
+; V7A-NEXT: lsrpl r2, r2, r4
+; V7A-NEXT: and r0, r2, r6
+; V7A-NEXT: pop {r4, r6, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r1, r1, #64
+; V7A-T-NEXT: mvn r3, #31
+; V7A-T-NEXT: ldrd r12, lr, [r0]
+; V7A-T-NEXT: mov.w r2, #-1
+; V7A-T-NEXT: uxtb r0, r1
+; V7A-T-NEXT: uxtab r3, r3, r1
+; V7A-T-NEXT: lsr.w r0, r2, r0
+; V7A-T-NEXT: cmp r3, #0
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: and.w r1, r0, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl r2, r3
+; V7A-T-NEXT: and.w r0, r2, r12
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: mov r4, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r0, r0, r1
+; V6M-NEXT: uxtb r2, r0
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ldm r4!, {r2, r3}
+; V6M-NEXT: ands r0, r2
+; V6M-NEXT: ands r1, r3
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %mask = lshr i64 -1, %sh_prom
+ %masked = and i64 %mask, %val
+ ret i64 %masked
+}
+
+define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_c4_commutative:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsbs.w lr, r2, #32
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: mov.w r12, #-1
+; V7M-NEXT: mov.w r3, #-1
+; V7M-NEXT: lsr.w r2, r12, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r3, r3, lr
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r2, #0
+; V7M-NEXT: ands r0, r3
+; V7M-NEXT: ands r1, r2
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_c4_commutative:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsbs lr, r2, #32
+; V7A-NEXT: rsb r2, r2, #64
+; V7A-NEXT: mvn r12, #0
+; V7A-NEXT: mvn r3, #0
+; V7A-NEXT: lsr r2, r12, r2
+; V7A-NEXT: lsrpl r3, r3, lr
+; V7A-NEXT: movwpl r2, #0
+; V7A-NEXT: and r0, r0, r3
+; V7A-NEXT: and r1, r1, r2
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_c4_commutative:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsbs.w lr, r2, #32
+; V7A-T-NEXT: rsb.w r2, r2, #64
+; V7A-T-NEXT: mov.w r12, #-1
+; V7A-T-NEXT: mov.w r3, #-1
+; V7A-T-NEXT: lsr.w r2, r12, r2
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r3, r3, lr
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r2, #0
+; V7A-T-NEXT: ands r0, r3
+; V7A-T-NEXT: ands r1, r2
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_c4_commutative:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, r5, r7, lr}
+; V6M-NEXT: push {r4, r5, r7, lr}
+; V6M-NEXT: mov r4, r1
+; V6M-NEXT: mov r5, r0
+; V6M-NEXT: movs r0, #64
+; V6M-NEXT: subs r2, r0, r2
+; V6M-NEXT: movs r0, #0
+; V6M-NEXT: mvns r0, r0
+; V6M-NEXT: mov r1, r0
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: ands r0, r5
+; V6M-NEXT: ands r1, r4
+; V6M-NEXT: pop {r4, r5, r7, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %mask = lshr i64 -1, %numhighbits
+ %masked = and i64 %val, %mask ; swapped order
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Pattern d. 32-bit.
+; ---------------------------------------------------------------------------- ;
+
+define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %val, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %val, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d2_load(ptr %w, i32 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i32 32, %numlowbits
+ %highbitscleared = shl i32 %val, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
+define i32 @bzhi32_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi32_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #32
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: lsls r0, r1
+; V7M-NEXT: lsrs r0, r1
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: rsb r1, r1, #32
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: uxtb r1, r1
+; V7A-NEXT: lsl r0, r0, r1
+; V7A-NEXT: lsr r0, r0, r1
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: rsb.w r1, r1, #32
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: uxtb r1, r1
+; V7A-T-NEXT: lsls r0, r1
+; V7A-T-NEXT: lsrs r0, r1
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #32
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r1, r1
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: lsls r0, r1
+; V6M-NEXT: lsrs r0, r1
+; V6M-NEXT: bx lr
+ %val = load i32, ptr %w
+ %numhighbits = sub i8 32, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i32
+ %highbitscleared = shl i32 %val, %sh_prom
+ %masked = lshr i32 %highbitscleared, %sh_prom
+ ret i32 %masked
+}
+
+; 64-bit.
+
+define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d0:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r3, r2, #64
+; V7M-NEXT: rsbs.w r2, r2, #32
+; V7M-NEXT: rsb.w lr, r3, #32
+; V7M-NEXT: lsl.w r12, r1, r3
+; V7M-NEXT: lsr.w r1, r0, lr
+; V7M-NEXT: orr.w r1, r1, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r2
+; V7M-NEXT: lsl.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r12, r1, lr
+; V7M-NEXT: lsr.w r0, r0, r3
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r2
+; V7M-NEXT: lsr.w r1, r1, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_d0:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: rsbs r2, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r1, r3, r1, lsl lr
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, lr
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, lr
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, lr
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d0:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #64
+; V7A-T-NEXT: rsbs.w r2, r2, #32
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r12, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_d0:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r4, r3, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %val, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d1_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r2, r2, #64
+; V7M-NEXT: uxtb r2, r2
+; V7M-NEXT: rsb.w r3, r2, #32
+; V7M-NEXT: lsl.w r12, r1, r2
+; V7M-NEXT: lsr.w r1, r0, r3
+; V7M-NEXT: orr.w r1, r1, r12
+; V7M-NEXT: subs.w r12, r2, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r1, r0, r12
+; V7M-NEXT: lsl.w r0, r0, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
+; V7M-NEXT: lsr.w r0, r0, r2
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r1, r12
+; V7M-NEXT: lsr.w r1, r1, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_d1_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r11, lr}
+; V7A-NEXT: push {r11, lr}
+; V7A-NEXT: rsb lr, r2, #64
+; V7A-NEXT: uxtb r3, lr
+; V7A-NEXT: rsb r12, r3, #32
+; V7A-NEXT: lsr r2, r0, r12
+; V7A-NEXT: orr r1, r2, r1, lsl r3
+; V7A-NEXT: mvn r2, #31
+; V7A-NEXT: uxtab r2, r2, lr
+; V7A-NEXT: cmp r2, #0
+; V7A-NEXT: lslpl r1, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d1_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r2, #64
+; V7A-T-NEXT: mvn r2, #31
+; V7A-T-NEXT: uxtb r3, r4
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: uxtab r2, r2, r4
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: cmp r2, #0
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bzhi64_d1_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r3, #64
+; V6M-NEXT: subs r2, r3, r2
+; V6M-NEXT: uxtb r4, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %val, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d2_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: .save {r7, lr}
+; V7M-NEXT: push {r7, lr}
+; V7M-NEXT: rsb.w r1, r2, #64
+; V7M-NEXT: ldrd r0, r3, [r0]
+; V7M-NEXT: rsb.w lr, r1, #32
+; V7M-NEXT: rsbs.w r2, r2, #32
+; V7M-NEXT: lsl.w r12, r3, r1
+; V7M-NEXT: lsr.w r3, r0, lr
+; V7M-NEXT: orr.w r3, r3, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r3, r0, r2
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r12, r3, lr
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r3, r1
+; V7M-NEXT: orr.w r0, r0, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r3, r2
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: pop {r7, pc}
+;
+; V7A-LABEL: bzhi64_d2_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, r7, r11, lr}
+; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: rsb r3, r2, #64
+; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: rsbs r2, r2, #32
+; V7A-NEXT: lsr r5, r0, r1
+; V7A-NEXT: orr r7, r5, r7, lsl r3
+; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsl r0, r0, r3
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r3
+; V7A-NEXT: orr r0, r0, r7, lsl r1
+; V7A-NEXT: lsr r1, r7, r3
+; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r5, r7, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d2_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r7, lr}
+; V7A-T-NEXT: push {r7, lr}
+; V7A-T-NEXT: rsb.w r3, r2, #64
+; V7A-T-NEXT: ldrd r0, r1, [r0]
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: rsbs.w r2, r2, #32
+; V7A-T-NEXT: lsl.w r12, r1, r3
+; V7A-T-NEXT: lsr.w r1, r0, lr
+; V7A-T-NEXT: orr.w r1, r1, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r1, r0, r2
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r12, r1, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r1, r2
+; V7A-T-NEXT: lsr.w r1, r1, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r7, pc}
+;
+; V6M-LABEL: bzhi64_d2_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r1, #64
+; V6M-NEXT: subs r4, r1, r2
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i64 64, %numlowbits
+ %highbitscleared = shl i64 %val, %numhighbits
+ %masked = lshr i64 %highbitscleared, %numhighbits
+ ret i64 %masked
+}
+
+define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
+; V7M-LABEL: bzhi64_d3_load_indexzext:
+; V7M: @ %bb.0:
+; V7M-NEXT: rsb.w r1, r1, #64
+; V7M-NEXT: ldrd r0, r2, [r0]
+; V7M-NEXT: uxtb r1, r1
+; V7M-NEXT: rsb.w r3, r1, #32
+; V7M-NEXT: lsl.w r12, r2, r1
+; V7M-NEXT: lsr.w r2, r0, r3
+; V7M-NEXT: orr.w r2, r2, r12
+; V7M-NEXT: subs.w r12, r1, #32
+; V7M-NEXT: it pl
+; V7M-NEXT: lslpl.w r2, r0, r12
+; V7M-NEXT: lsl.w r0, r0, r1
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r2, r3
+; V7M-NEXT: lsr.w r0, r0, r1
+; V7M-NEXT: lsr.w r1, r2, r1
+; V7M-NEXT: orr.w r0, r0, r3
+; V7M-NEXT: it pl
+; V7M-NEXT: lsrpl.w r0, r2, r12
+; V7M-NEXT: it pl
+; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_d3_load_indexzext:
+; V7A: @ %bb.0:
+; V7A-NEXT: .save {r5, r7, r11, lr}
+; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: rsb r1, r1, #64
+; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: uxtb r2, r1
+; V7A-NEXT: rsb r3, r2, #32
+; V7A-NEXT: lsr r5, r0, r3
+; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: mvn r5, #31
+; V7A-NEXT: uxtab r1, r5, r1
+; V7A-NEXT: cmp r1, #0
+; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lsl r0, r0, r2
+; V7A-NEXT: movwpl r0, #0
+; V7A-NEXT: lsr r0, r0, r2
+; V7A-NEXT: orr r0, r0, r7, lsl r3
+; V7A-NEXT: lsrpl r0, r7, r1
+; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: movwpl r1, #0
+; V7A-NEXT: pop {r5, r7, r11, pc}
+;
+; V7A-T-LABEL: bzhi64_d3_load_indexzext:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: .save {r4, lr}
+; V7A-T-NEXT: push {r4, lr}
+; V7A-T-NEXT: rsb.w r4, r1, #64
+; V7A-T-NEXT: ldrd r0, r2, [r0]
+; V7A-T-NEXT: mvn r1, #31
+; V7A-T-NEXT: uxtb r3, r4
+; V7A-T-NEXT: rsb.w lr, r3, #32
+; V7A-T-NEXT: lsl.w r12, r2, r3
+; V7A-T-NEXT: uxtab r1, r1, r4
+; V7A-T-NEXT: lsr.w r2, r0, lr
+; V7A-T-NEXT: cmp r1, #0
+; V7A-T-NEXT: orr.w r2, r2, r12
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lslpl.w r2, r0, r1
+; V7A-T-NEXT: lsl.w r0, r0, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r0, #0
+; V7A-T-NEXT: lsl.w r4, r2, lr
+; V7A-T-NEXT: lsr.w r0, r0, r3
+; V7A-T-NEXT: orr.w r0, r0, r4
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: lsrpl.w r0, r2, r1
+; V7A-T-NEXT: lsr.w r1, r2, r3
+; V7A-T-NEXT: it pl
+; V7A-T-NEXT: movpl r1, #0
+; V7A-T-NEXT: pop {r4, pc}
+;
+; V6M-LABEL: bzhi64_d3_load_indexzext:
+; V6M: @ %bb.0:
+; V6M-NEXT: .save {r4, lr}
+; V6M-NEXT: push {r4, lr}
+; V6M-NEXT: movs r2, #64
+; V6M-NEXT: subs r1, r2, r1
+; V6M-NEXT: uxtb r4, r1
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsl
+; V6M-NEXT: mov r2, r4
+; V6M-NEXT: bl __aeabi_llsr
+; V6M-NEXT: pop {r4, pc}
+ %val = load i64, ptr %w
+ %numhighbits = sub i8 64, %numlowbits
+ %sh_prom = zext i8 %numhighbits to i64
+ %highbitscleared = shl i64 %val, %sh_prom
+ %masked = lshr i64 %highbitscleared, %sh_prom
+ ret i64 %masked
+}
+
+; ---------------------------------------------------------------------------- ;
+; Constant mask
+; ---------------------------------------------------------------------------- ;
+
+; 32-bit
+
+define i32 @bzhi32_constant_mask32(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask32:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask32:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask32:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %masked = and i32 %val, 2147483647
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask32_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask32_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask32_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask32_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask32_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: bx lr
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 2147483647
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask16(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask16:
+; V7M: @ %bb.0:
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask16:
+; V7A: @ %bb.0:
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask16:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask16:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, .LCPI41_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI41_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %masked = and i32 %val, 32767
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask16_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask16_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask16_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask16_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask16_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: ldr r0, .LCPI42_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI42_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 32767
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask8(i32 %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask8:
+; V7M: @ %bb.0:
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask8:
+; V7A: @ %bb.0:
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask8:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask8:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %masked = and i32 %val, 127
+ ret i32 %masked
+}
+
+define i32 @bzhi32_constant_mask8_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi32_constant_mask8_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi32_constant_mask8_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi32_constant_mask8_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi32_constant_mask8_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: movs r0, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: bx lr
+ %val1 = load i32, ptr %val
+ %masked = and i32 %val1, 127
+ ret i32 %masked
+}
+
+; 64-bit
+
+define i64 @bzhi64_constant_mask64(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask64:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r1, r1, #-1073741824
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask64:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r1, r1, #-1073741824
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask64:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r1, r1, #-1073741824
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask64:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r2, #3
+; V6M-NEXT: lsls r2, r2, #30
+; V6M-NEXT: bics r1, r2
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 4611686018427387903
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask64_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask64_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldrd r0, r1, [r0]
+; V7M-NEXT: bic r1, r1, #-1073741824
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask64_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldrd r0, r1, [r0]
+; V7A-NEXT: bic r1, r1, #-1073741824
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask64_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldrd r0, r1, [r0]
+; V7A-T-NEXT: bic r1, r1, #-1073741824
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask64_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #3
+; V6M-NEXT: lsls r3, r1, #30
+; V6M-NEXT: ldr r2, [r0]
+; V6M-NEXT: ldr r1, [r0, #4]
+; V6M-NEXT: bics r1, r3
+; V6M-NEXT: mov r0, r2
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 4611686018427387903
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask32(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask32:
+; V7M: @ %bb.0:
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask32:
+; V7A: @ %bb.0:
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask32:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask32:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 2147483647
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask32_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask32_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bic r0, r0, #-2147483648
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask32_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bic r0, r0, #-2147483648
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask32_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bic r0, r0, #-2147483648
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask32_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #1
+; V6M-NEXT: lsls r1, r1, #31
+; V6M-NEXT: ldr r0, [r0]
+; V6M-NEXT: bics r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 2147483647
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask16(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask16:
+; V7M: @ %bb.0:
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask16:
+; V7A: @ %bb.0:
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask16:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask16:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, .LCPI49_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI49_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %masked = and i64 %val, 32767
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask16_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask16_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bfc r0, #15, #17
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask16_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bfc r0, #15, #17
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask16_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bfc r0, #15, #17
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask16_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: ldr r0, .LCPI50_0
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+; V6M-NEXT: .p2align 2
+; V6M-NEXT: @ %bb.1:
+; V6M-NEXT: .LCPI50_0:
+; V6M-NEXT: .long 32767 @ 0x7fff
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 32767
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask8(i64 %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask8:
+; V7M: @ %bb.0:
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask8:
+; V7A: @ %bb.0:
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask8:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask8:
+; V6M: @ %bb.0:
+; V6M-NEXT: movs r1, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %masked = and i64 %val, 127
+ ret i64 %masked
+}
+
+define i64 @bzhi64_constant_mask8_load(ptr %val) nounwind {
+; V7M-LABEL: bzhi64_constant_mask8_load:
+; V7M: @ %bb.0:
+; V7M-NEXT: ldr r0, [r0]
+; V7M-NEXT: movs r1, #0
+; V7M-NEXT: and r0, r0, #127
+; V7M-NEXT: bx lr
+;
+; V7A-LABEL: bzhi64_constant_mask8_load:
+; V7A: @ %bb.0:
+; V7A-NEXT: ldr r0, [r0]
+; V7A-NEXT: mov r1, #0
+; V7A-NEXT: and r0, r0, #127
+; V7A-NEXT: bx lr
+;
+; V7A-T-LABEL: bzhi64_constant_mask8_load:
+; V7A-T: @ %bb.0:
+; V7A-T-NEXT: ldr r0, [r0]
+; V7A-T-NEXT: movs r1, #0
+; V7A-T-NEXT: and r0, r0, #127
+; V7A-T-NEXT: bx lr
+;
+; V6M-LABEL: bzhi64_constant_mask8_load:
+; V6M: @ %bb.0:
+; V6M-NEXT: ldr r1, [r0]
+; V6M-NEXT: movs r0, #127
+; V6M-NEXT: ands r0, r1
+; V6M-NEXT: movs r1, #0
+; V6M-NEXT: bx lr
+ %val1 = load i64, ptr %val
+ %masked = and i64 %val1, 127
+ ret i64 %masked
+}
diff --git a/llvm/test/CodeGen/ARM/llrint-conv.ll b/llvm/test/CodeGen/ARM/llrint-conv.ll
index 749ee00..a1a04db 100644
--- a/llvm/test/CodeGen/ARM/llrint-conv.ll
+++ b/llvm/test/CodeGen/ARM/llrint-conv.ll
@@ -1,46 +1,71 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
-; SOFTFP-LABEL: testmsxh_builtin:
-; SOFTFP: bl llrintf
-; HARDFP-LABEL: testmsxh_builtin:
-; HARDFP: bl llrintf
define i64 @testmsxh_builtin(half %x) {
+; CHECK-SOFT-LABEL: testmsxh_builtin:
+; CHECK-SOFT: @ %bb.0: @ %entry
+; CHECK-SOFT-NEXT: .save {r11, lr}
+; CHECK-SOFT-NEXT: push {r11, lr}
+; CHECK-SOFT-NEXT: bl __aeabi_h2f
+; CHECK-SOFT-NEXT: bl llrintf
+; CHECK-SOFT-NEXT: pop {r11, pc}
+;
+; CHECK-NOFP16-LABEL: testmsxh_builtin:
+; CHECK-NOFP16: @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT: .save {r11, lr}
+; CHECK-NOFP16-NEXT: push {r11, lr}
+; CHECK-NOFP16-NEXT: vmov r0, s0
+; CHECK-NOFP16-NEXT: bl __aeabi_h2f
+; CHECK-NOFP16-NEXT: vmov s0, r0
+; CHECK-NOFP16-NEXT: bl llrintf
+; CHECK-NOFP16-NEXT: pop {r11, pc}
+;
+; CHECK-FP16-LABEL: testmsxh_builtin:
+; CHECK-FP16: @ %bb.0: @ %entry
+; CHECK-FP16-NEXT: .save {r11, lr}
+; CHECK-FP16-NEXT: push {r11, lr}
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: bl llrintf
+; CHECK-FP16-NEXT: pop {r11, pc}
entry:
%0 = tail call i64 @llvm.llrint.i64.f16(half %x)
ret i64 %0
}
-; SOFTFP-LABEL: testmsxs_builtin:
-; SOFTFP: bl llrintf
-; HARDFP-LABEL: testmsxs_builtin:
-; HARDFP: bl llrintf
define i64 @testmsxs_builtin(float %x) {
+; CHECK-LABEL: testmsxs_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llrintf
+; CHECK-NEXT: pop {r11, pc}
entry:
%0 = tail call i64 @llvm.llrint.i64.f32(float %x)
ret i64 %0
}
-; SOFTFP-LABEL: testmsxd_builtin:
-; SOFTFP: bl llrint
-; HARDFP-LABEL: testmsxd_builtin:
-; HARDFP: bl llrint
define i64 @testmsxd_builtin(double %x) {
+; CHECK-LABEL: testmsxd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llrint
+; CHECK-NEXT: pop {r11, pc}
entry:
%0 = tail call i64 @llvm.llrint.i64.f64(double %x)
ret i64 %0
}
-; FIXME(#44744): incorrect libcall
-; SOFTFP-LABEL: testmsxq_builtin:
-; SOFTFP: bl llrintl
-; HARDFP-LABEL: testmsxq_builtin:
-; HARDFP: bl llrintl
define i64 @testmsxq_builtin(fp128 %x) {
+; CHECK-LABEL: testmsxq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: pop {r11, pc}
entry:
%0 = tail call i64 @llvm.llrint.i64.f128(fp128 %x)
ret i64 %0
}
-
-declare i64 @llvm.llrint.i64.f32(float) nounwind readnone
-declare i64 @llvm.llrint.i64.f64(double) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/lrint-conv.ll b/llvm/test/CodeGen/ARM/lrint-conv.ll
index 9aa9511..23a2685 100644
--- a/llvm/test/CodeGen/ARM/lrint-conv.ll
+++ b/llvm/test/CodeGen/ARM/lrint-conv.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
; FIXME: crash
; define i32 @testmswh_builtin(half %x) {
@@ -8,36 +10,37 @@
; ret i32 %0
; }
-; SOFTFP-LABEL: testmsws_builtin:
-; SOFTFP: bl lrintf
-; HARDFP-LABEL: testmsws_builtin:
-; HARDFP: bl lrintf
define i32 @testmsws_builtin(float %x) {
+; CHECK-LABEL: testmsws_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lrintf
entry:
%0 = tail call i32 @llvm.lrint.i32.f32(float %x)
ret i32 %0
}
-; SOFTFP-LABEL: testmswd_builtin:
-; SOFTFP: bl lrint
-; HARDFP-LABEL: testmswd_builtin:
-; HARDFP: bl lrint
define i32 @testmswd_builtin(double %x) {
+; CHECK-LABEL: testmswd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lrint
entry:
%0 = tail call i32 @llvm.lrint.i32.f64(double %x)
ret i32 %0
}
-; FIXME(#44744): incorrect libcall
-; SOFTFP-LABEL: testmswq_builtin:
-; SOFTFP: bl lrintl
-; HARDFP-LABEL: testmswq_builtin:
-; HARDFP: bl lrintl
define i32 @testmswq_builtin(fp128 %x) {
+; CHECK-LABEL: testmswq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl lrintl
+; CHECK-NEXT: pop {r11, pc}
entry:
%0 = tail call i32 @llvm.lrint.i32.f128(fp128 %x)
ret i32 %0
}
-declare i32 @llvm.lrint.i32.f32(float) nounwind readnone
-declare i32 @llvm.lrint.i32.f64(double) nounwind readnone
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-FP16: {{.*}}
+; CHECK-NOFP16: {{.*}}
+; CHECK-SOFT: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/vector-lrint.ll b/llvm/test/CodeGen/ARM/vector-lrint.ll
index fe5e3cb..c1159da 100644
--- a/llvm/test/CodeGen/ARM/vector-lrint.ll
+++ b/llvm/test/CodeGen/ARM/vector-lrint.ll
@@ -14,31 +14,26 @@
; %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half> %x)
; ret <1 x iXLen> %a
; }
-; declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half>)
; define <2 x iXLen> @lrint_v2f16(<2 x half> %x) {
; %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half> %x)
; ret <2 x iXLen> %a
; }
-; declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half>)
; define <4 x iXLen> @lrint_v4f16(<4 x half> %x) {
; %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half> %x)
; ret <4 x iXLen> %a
; }
-; declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half>)
; define <8 x iXLen> @lrint_v8f16(<8 x half> %x) {
; %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half> %x)
; ret <8 x iXLen> %a
; }
-; declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half>)
; define <16 x iXLen> @lrint_v16f16(<16 x half> %x) {
; %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half> %x)
; ret <16 x iXLen> %a
; }
-; declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half>)
define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
; LE-I32-LABEL: lrint_v1f32:
@@ -76,7 +71,6 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
ret <1 x iXLen> %a
}
-declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
; LE-I32-LABEL: lrint_v2f32:
@@ -160,7 +154,6 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
%a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
ret <2 x iXLen> %a
}
-declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
; LE-I32-LABEL: lrint_v4f32:
@@ -274,7 +267,6 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
%a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
ret <4 x iXLen> %a
}
-declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
; LE-I32-LABEL: lrint_v8f32:
@@ -488,7 +480,6 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
%a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x)
ret <8 x iXLen> %a
}
-declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float>)
define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
; LE-I32-LABEL: lrint_v16f32:
@@ -1005,7 +996,6 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
%a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float> %x)
ret <16 x iXLen> %a
}
-declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float>)
define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
; LE-I32-LABEL: lrint_v1f64:
@@ -1043,7 +1033,6 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double> %x)
ret <1 x iXLen> %a
}
-declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double>)
define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
; LE-I32-LABEL: lrint_v2f64:
@@ -1120,7 +1109,6 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
%a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
ret <2 x iXLen> %a
}
-declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double>)
define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
; LE-I32-LABEL: lrint_v4f64:
@@ -1237,7 +1225,6 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
%a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
ret <4 x iXLen> %a
}
-declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double>)
define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
; LE-I32-LABEL: lrint_v8f64:
@@ -1467,7 +1454,6 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
%a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x)
ret <8 x iXLen> %a
}
-declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double>)
define <16 x iXLen> @lrint_v16f64(<16 x double> %x) {
; LE-I32-LABEL: lrint_v16f64:
@@ -2053,7 +2039,6 @@ define <16 x iXLen> @lrint_v16f64(<16 x double> %x) {
%a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double> %x)
ret <16 x iXLen> %a
}
-declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double>)
define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) {
; LE-I32-LABEL: lrint_v1fp128:
@@ -2091,7 +2076,6 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) {
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1fp128(<1 x fp128> %x)
ret <1 x iXLen> %a
}
-declare <1 x iXLen> @llvm.lrint.v1iXLen.v1fp128(<1 x fp128>)
define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) {
; LE-I32-LABEL: lrint_v2fp128:
@@ -2194,7 +2178,6 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) {
%a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2fp128(<2 x fp128> %x)
ret <2 x iXLen> %a
}
-declare <2 x iXLen> @llvm.lrint.v2iXLen.v2fp128(<2 x fp128>)
define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) {
; LE-I32-LABEL: lrint_v4fp128:
@@ -2347,7 +2330,6 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) {
%a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4fp128(<4 x fp128> %x)
ret <4 x iXLen> %a
}
-declare <4 x iXLen> @llvm.lrint.v4iXLen.v4fp128(<4 x fp128>)
define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) {
; LE-I32-LABEL: lrint_v8fp128:
@@ -2664,7 +2646,6 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) {
%a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8fp128(<8 x fp128> %x)
ret <8 x iXLen> %a
}
-declare <8 x iXLen> @llvm.lrint.v8iXLen.v8fp128(<8 x fp128>)
define <16 x iXLen> @lrint_v16fp128(<16 x fp128> %x) {
; LE-I32-LABEL: lrint_v16fp128:
@@ -3262,4 +3243,3 @@ define <16 x iXLen> @lrint_v16fp128(<16 x fp128> %x) {
%a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16fp128(<16 x fp128> %x)
ret <16 x iXLen> %a
}
-declare <16 x iXLen> @llvm.lrint.v16iXLen.v16fp128(<16 x fp128>)
diff --git a/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll
new file mode 100644
index 0000000..267e365
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128B < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Check that codegen does not assert when the unaligned vector store V6_vS32Ub_npred_ai is generated.
+; CHECK: if (!p{{[0-3]}}) vmemu
+
+target triple = "hexagon-unknown-unknown-elf"
+
+define fastcc void @test(i1 %cmp.i.i) {
+entry:
+ %call.i.i.i172 = load ptr, ptr null, align 4
+ %add.ptr = getelementptr i8, ptr %call.i.i.i172, i32 1
+ store <32 x i32> zeroinitializer, ptr %add.ptr, align 128
+ %add.ptr4.i4 = getelementptr i8, ptr %call.i.i.i172, i32 129
+ br i1 %cmp.i.i, label %common.ret, label %if.end.i.i
+
+common.ret: ; preds = %if.end.i.i, %entry
+ ret void
+
+if.end.i.i: ; preds = %entry
+ store <32 x i32> zeroinitializer, ptr %add.ptr4.i4, align 1
+ br label %common.ret
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
index 18fb879..21ca041 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
@@ -115,5 +115,150 @@ define ptx_kernel void @inlineasm(ptr %p) {
store <2 x float> %mul, ptr %p, align 8
ret void
}
+
+define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
+; CHECK-SM90A-LABEL: trunc_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0];
+; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U;
+; CHECK-SM90A-NEXT: mov.b32 %r4, 0;
+; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U;
+; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: trunc_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<3>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-SM100-NEXT: mov.b32 %r3, 0;
+; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U;
+; CHECK-SM100-NEXT: mov.b64 %rd2, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd2], %r6;
+; CHECK-SM100-NEXT: ret;
+ %2 = trunc <2 x i32> %0 to <2 x i8>
+ %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i8> %3, ptr null, align 4
+ ret void
+}
+
+define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: zextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<4>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: zextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<5>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2};
+; CHECK-SM100-NEXT: mov.b32 %r4, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = zext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
+
+define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: sextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: sextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3};
+; CHECK-SM100-NEXT: mov.b32 %r6, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = sext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 06d54fa..95bff27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -301,3 +301,135 @@ define void @vfmv.s.f(ptr %p, double %x) {
store volatile double %x, ptr %p
ret void
}
+
+; This test is fairly fragile, but it's trying to cover the case that
+; caused the revert of bba9172 due to its interaction with how rematerialized
+; instructions are pruned from the original live interval. In the result
+; below, we remat the vmv.v.x into the loop, but fail to remat the vmv.v.x
+; a second time after further splitting its live range. We shouldn't need
+; to spill it to the stack at all.
+define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, ptr %p) #0 {
+; CHECK-LABEL: dual_remat:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: srli a1, a2, 3
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: .LBB8_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a5, a4, 4
+; CHECK-NEXT: add a4, a5, a4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vand.vv v16, v16, v8
+; CHECK-NEXT: vmsne.vi v24, v16, 0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill
+; CHECK-NEXT: vand.vv v16, v0, v8
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl1r.v v9, (a4) # vscale x 8-byte Folded Reload
+; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT: vcpop.m a4, v9
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a6, a5, 4
+; CHECK-NEXT: add a5, a6, a5
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a3)
+; CHECK-NEXT: vs8r.v v8, (a2)
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma
+; CHECK-NEXT: vor.vv v16, v16, v8
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 3
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vor.vv v0, v0, v8
+; CHECK-NEXT: beqz a4, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %middle.block
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+ %broadcast.splatinsert = insertelement <vscale x 16 x i64> zeroinitializer, i64 %0, i64 0
+ %broadcast.splat = shufflevector <vscale x 16 x i64> %broadcast.splatinsert, <vscale x 16 x i64> zeroinitializer, <vscale x 16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %vec.ind = phi <vscale x 16 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
+ %3 = and <vscale x 16 x i64> %vec.ind, %broadcast.splat
+ %4 = icmp ne <vscale x 16 x i64> %3, zeroinitializer
+ store <vscale x 16 x i64> %broadcast.splat, ptr %p
+ %5 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %4)
+ %vec.ind.next = or <vscale x 16 x i64> %vec.ind, %1
+ br i1 %5, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %and.i = and i64 1, %0
+ ret i64 %and.i
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
index cd52498..2964da9 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
@@ -32,6 +32,7 @@
; CHECK-DAG: OpDecorate [[g]] Binding 0
; CHECK-DAG: OpDecorate [[h]] DescriptorSet 10
; CHECK-DAG: OpDecorate [[h]] Binding 3
+; CHECK-NOT: OpDecorate [[h]] Binding 4
; CHECK-DAG: OpDecorate [[i]] DescriptorSet 10
; CHECK-DAG: OpDecorate [[i]] Binding 2
@@ -44,30 +45,34 @@ entry:
%3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.6)
%4 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 1, i32 1, i32 0, ptr nonnull @.str.8)
%5 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 2, i32 10, i32 1, i32 0, ptr nonnull @.str.10)
- %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 1, i32 0, ptr nonnull @.str.12)
- %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14)
- %8 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
- %9 = load i32, ptr addrspace(11) %8, align 4
- %10 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
- %11 = load i32, ptr addrspace(11) %10, align 4
- %add.i = add nsw i32 %11, %9
- %12 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
- %13 = load i32, ptr addrspace(11) %12, align 4
- %add4.i = add nsw i32 %add.i, %13
- %14 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
- %15 = load i32, ptr addrspace(11) %14, align 4
- %add6.i = add nsw i32 %add4.i, %15
- %16 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
- %17 = load i32, ptr addrspace(11) %16, align 4
- %add8.i = add nsw i32 %add6.i, %17
- %18 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
- %19 = load i32, ptr addrspace(11) %18, align 4
- %add10.i = add nsw i32 %add8.i, %19
- %20 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
- %21 = load i32, ptr addrspace(11) %20, align 4
- %add12.i = add nsw i32 %add10.i, %21
- %22 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
- store i32 %add12.i, ptr addrspace(11) %22, align 4
+ %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 0, ptr nonnull @.str.12)
+ %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 1, ptr nonnull @.str.12)
+ %8 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14)
+ %9 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
+ %10 = load i32, ptr addrspace(11) %9, align 4
+ %11 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
+ %12 = load i32, ptr addrspace(11) %11, align 4
+ %add.i = add nsw i32 %12, %10
+ %13 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+ %14 = load i32, ptr addrspace(11) %13, align 4
+ %add4.i = add nsw i32 %add.i, %14
+ %15 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
+ %16 = load i32, ptr addrspace(11) %15, align 4
+ %add6.i = add nsw i32 %add4.i, %16
+ %17 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
+ %18 = load i32, ptr addrspace(11) %17, align 4
+ %add8.i = add nsw i32 %add6.i, %18
+ %19 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
+ %20 = load i32, ptr addrspace(11) %19, align 4
+ %add10.i = add nsw i32 %add8.i, %20
+ %21 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
+ %22 = load i32, ptr addrspace(11) %21, align 4
+ %add12.i = add nsw i32 %add10.i, %22
+ %23 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %8, i32 0)
+ %24 = load i32, ptr addrspace(11) %23, align 4
+ %add14.i = add nsw i32 %add12.i, %24
+ %25 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+ store i32 %add14.i, ptr addrspace(11) %25, align 4
ret void
}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
new file mode 100644
index 0000000..c968c99
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; CHECK-ERROR: LLVM ERROR: Implicit binding calls with the same order ID must have the same descriptor set
+
+@.str = private unnamed_addr constant [2 x i8] c"b\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"c\00", align 1
+
+define void @main() local_unnamed_addr #0 {
+entry:
+ %0 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 0, i32 1, i32 0, ptr nonnull @.str)
+ %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+ %2 = load i32, ptr addrspace(11) %1, align 4
+ %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 1, i32 1, i32 0, ptr nonnull @.str.2)
+ %4 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+ store i32 %2, ptr addrspace(11) %4, align 4
+ ret void
+}
+
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
index d3d6413..eb7c1b6 100644
--- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -235,7 +235,7 @@ define half @f12_half(half %dummy, half %val, ptr %dest) {
; CHECK-NEXT: blah %f0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
-; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: ltebr %f1, %f0
; CHECK-NEXT: jl .LBB11_2
; CHECK-NEXT:# %bb.1:
; CHECK-NEXT: lgdr %r0, %f8
@@ -344,7 +344,7 @@ define half @f15_half(half %val, half %dummy, ptr %dest) {
; CHECK-NEXT: blah %f2
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
-; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: ltebr %f1, %f0
; CHECK-NEXT: jl .LBB15_2
; CHECK-NEXT:# %bb.1:
; CHECK-NEXT: lgdr %r0, %f8
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba..93e2889 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,22 @@
define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: udiv_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s4, %s0, (56)0
; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: cmpu.w %s5, %s3, (56)0
+; CHECK-NEXT: or %s3, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s2, (56)0
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s1, (56)0
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
@@ -28,27 +31,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: urem_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
+; CHECK-NEXT: and %s4, %s0, (56)0
+; CHECK-NEXT: and %s5, %s1, (56)0
+; CHECK-NEXT: and %s6, %s2, (56)0
+; CHECK-NEXT: and %s7, %s3, (56)0
+; CHECK-NEXT: cmpu.w %s7, %s7, (56)0
+; CHECK-NEXT: cmov.w.eq %s3, (0)1, %s7
+; CHECK-NEXT: cmpu.w %s6, %s6, (56)0
+; CHECK-NEXT: cmov.w.eq %s2, (0)1, %s6
+; CHECK-NEXT: cmpu.w %s5, %s5, (56)0
+; CHECK-NEXT: cmov.w.eq %s1, (0)1, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (0)1, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
index 6ef7219..9cf7aab 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -56,14 +56,9 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-LABEL: PR90954:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %r13
-; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
+; CHECK-NEXT: subq $2912, %rsp # imm = 0xB60
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -79,29 +74,26 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-NEXT: movw $64, %cx
; CHECK-NEXT: movw $16, %di
; CHECK-NEXT: movb $1, %r8b
-; CHECK-NEXT: movl $64, %r9d
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r9d, %r9d
+; CHECK-NEXT: xorl %r10d, %r10d
; CHECK-NEXT: jmp .LBB1_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: incq %r14
-; CHECK-NEXT: addl %edx, %ebx
+; CHECK-NEXT: incq %r10
+; CHECK-NEXT: addl %edx, %r9d
; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_2 Depth 2
-; CHECK-NEXT: movslq %ebx, %r15
-; CHECK-NEXT: leaq (%rsi,%r15,4), %r15
-; CHECK-NEXT: xorl %r12d, %r12d
-; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: movslq %r9d, %r11
+; CHECK-NEXT: leaq (%rsi,%r11,4), %r11
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2
-; CHECK-NEXT: tilestored %tmm1, (%r15,%rax)
-; CHECK-NEXT: incq %r13
-; CHECK-NEXT: addq $64, %r15
-; CHECK-NEXT: decq %r12
+; CHECK-NEXT: tilestored %tmm1, (%r11,%rax)
+; CHECK-NEXT: incq %r14
+; CHECK-NEXT: addq $64, %r11
+; CHECK-NEXT: decq %rbx
; CHECK-NEXT: je .LBB1_5
; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
@@ -110,46 +102,12 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: jne .LBB1_4
; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2
+; CHECK-NEXT: tilezero %tmm1
+; CHECK-NEXT: tilezero %tmm2
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd 3072(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movabsq $64, %rbp
+; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT: jmp .LBB1_4
%4 = shl i32 %2, 4
%5 = icmp eq i64 0, 0
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index ec1b8a3..f998128 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -335,84 +335,83 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: andl $-16, %esp
; X86-SLOW-NEXT: subl $32, %esp
-; X86-SLOW-NEXT: movl 24(%ebp), %esi
+; X86-SLOW-NEXT: movl 24(%ebp), %edi
; X86-SLOW-NEXT: movl 28(%ebp), %eax
; X86-SLOW-NEXT: movl 48(%ebp), %edx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: testb $64, %cl
-; X86-SLOW-NEXT: movl 52(%ebp), %edi
+; X86-SLOW-NEXT: movl 52(%ebp), %ebx
; X86-SLOW-NEXT: jne .LBB6_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %esi, %edx
-; X86-SLOW-NEXT: movl 32(%ebp), %esi
-; X86-SLOW-NEXT: movl %edi, %ecx
-; X86-SLOW-NEXT: movl %eax, %edi
+; X86-SLOW-NEXT: movl %edi, %edx
+; X86-SLOW-NEXT: movl 32(%ebp), %edi
+; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, %ebx
; X86-SLOW-NEXT: movl 36(%ebp), %eax
; X86-SLOW-NEXT: jmp .LBB6_3
; X86-SLOW-NEXT: .LBB6_1:
; X86-SLOW-NEXT: movl 40(%ebp), %ecx
; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl 44(%ebp), %ecx
+; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: .LBB6_3:
-; X86-SLOW-NEXT: movl 56(%ebp), %ebx
-; X86-SLOW-NEXT: testb $32, %bl
+; X86-SLOW-NEXT: movl 56(%ebp), %ecx
+; X86-SLOW-NEXT: testb $32, %cl
; X86-SLOW-NEXT: jne .LBB6_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %ecx, %ebx
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %edi
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: jmp .LBB6_6
; X86-SLOW-NEXT: .LBB6_4:
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ecx, %edx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: .LBB6_6:
-; X86-SLOW-NEXT: movl %edx, %esi
+; X86-SLOW-NEXT: movl %edi, %eax
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: shrl %esi
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: notb %dl
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: orl %eax, %esi
+; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ebx, %eax
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: movl %ebx, %edi
+; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: shrl %edi
-; X86-SLOW-NEXT: movl %ecx, %ebx
-; X86-SLOW-NEXT: notb %bl
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: orl %esi, %edi
+; X86-SLOW-NEXT: orl %eax, %edi
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: shrl %edx
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: orl %eax, %edx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-SLOW-NEXT: movl %ebx, %eax
+; X86-SLOW-NEXT: shrl %ebx
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %ebx
+; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: shrl %esi
-; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: orl %eax, %esi
-; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: shrl %ebx
-; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SLOW-NEXT: shrl %cl, %ebx
-; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 8(%ebp), %eax
-; X86-SLOW-NEXT: movl %ebx, 12(%eax)
-; X86-SLOW-NEXT: movl %esi, 8(%eax)
-; X86-SLOW-NEXT: movl %edx, 4(%eax)
-; X86-SLOW-NEXT: movl %edi, (%eax)
+; X86-SLOW-NEXT: movl %esi, 12(%eax)
+; X86-SLOW-NEXT: movl %ebx, 8(%eax)
+; X86-SLOW-NEXT: movl %edi, 4(%eax)
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, (%eax)
; X86-SLOW-NEXT: leal -12(%ebp), %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 544ab7f..c307833 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -322,79 +322,79 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SLOW-NEXT: subl $16, %esp
; X86-SLOW-NEXT: movl 24(%ebp), %edx
; X86-SLOW-NEXT: movl 28(%ebp), %esi
-; X86-SLOW-NEXT: movl 48(%ebp), %ebx
+; X86-SLOW-NEXT: movl 48(%ebp), %edi
; X86-SLOW-NEXT: movl 56(%ebp), %eax
; X86-SLOW-NEXT: testb $64, %al
-; X86-SLOW-NEXT: movl 52(%ebp), %edi
+; X86-SLOW-NEXT: movl 52(%ebp), %eax
; X86-SLOW-NEXT: je .LBB6_1
; X86-SLOW-NEXT: # %bb.2:
-; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %edi
; X86-SLOW-NEXT: movl 32(%ebp), %edx
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: movl 36(%ebp), %esi
; X86-SLOW-NEXT: jmp .LBB6_3
; X86-SLOW-NEXT: .LBB6_1:
-; X86-SLOW-NEXT: movl 40(%ebp), %eax
-; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl 44(%ebp), %eax
+; X86-SLOW-NEXT: movl 40(%ebp), %ecx
+; X86-SLOW-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl 44(%ebp), %ecx
; X86-SLOW-NEXT: .LBB6_3:
-; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: testb $32, %cl
+; X86-SLOW-NEXT: movl 56(%ebp), %ebx
+; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB6_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ecx, %ebx
; X86-SLOW-NEXT: jmp .LBB6_6
; X86-SLOW-NEXT: .LBB6_4:
; X86-SLOW-NEXT: movl %edx, %esi
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %ebx
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, %edi
+; X86-SLOW-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-SLOW-NEXT: .LBB6_6:
-; X86-SLOW-NEXT: shrl %cl, %eax
-; X86-SLOW-NEXT: movl %eax, %edx
-; X86-SLOW-NEXT: movl %ecx, %eax
-; X86-SLOW-NEXT: notb %al
-; X86-SLOW-NEXT: movl %ebx, %edi
-; X86-SLOW-NEXT: addl %ebx, %ebx
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %ebx
-; X86-SLOW-NEXT: orl %edx, %ebx
-; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: orl %edi, %edx
+; X86-SLOW-NEXT: shrl %cl, %ebx
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: notb %dl
+; X86-SLOW-NEXT: movl %edi, %eax
+; X86-SLOW-NEXT: addl %edi, %edi
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: orl %ebx, %edi
+; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %ebx
-; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-SLOW-NEXT: leal (%edi,%edi), %ebx
-; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %ebx
-; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT: leal (%eax,%eax), %edi
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-SLOW-NEXT: movl 56(%ebp), %ecx
+; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: addl %esi, %esi
-; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl 8(%ebp), %ecx
-; X86-SLOW-NEXT: movl %esi, 12(%ecx)
-; X86-SLOW-NEXT: movl %ebx, 8(%ecx)
-; X86-SLOW-NEXT: movl %edx, 4(%ecx)
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-SLOW-NEXT: movl %eax, (%ecx)
-; X86-SLOW-NEXT: movl %ecx, %eax
+; X86-SLOW-NEXT: orl %eax, %esi
+; X86-SLOW-NEXT: movl 8(%ebp), %eax
+; X86-SLOW-NEXT: movl %esi, 12(%eax)
+; X86-SLOW-NEXT: movl %edi, 8(%eax)
+; X86-SLOW-NEXT: movl %ebx, 4(%eax)
+; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, (%eax)
; X86-SLOW-NEXT: leal -12(%ebp), %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/pr161693.ll b/llvm/test/CodeGen/X86/pr161693.ll
new file mode 100644
index 0000000..de8188f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr161693.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+define void @PR161693() #0 {
+; CHECK-LABEL: PR161693:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movzbl (%rax), %eax
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: addb $-71, %al
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmpb $-6, %al
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: leal (%rcx,%rcx), %edx
+; CHECK-NEXT: orb %cl, %dl
+; CHECK-NEXT: leal (,%rdx,4), %ecx
+; CHECK-NEXT: orb %dl, %cl
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.2: # %exit
+; CHECK-NEXT: retq
+start:
+ br label %loop
+
+loop:
+ %.val.i.i89 = load <16 x i8>, ptr poison, align 1
+ %.not49.i = icmp ult <16 x i8> zeroinitializer, splat (i8 -10)
+ %i = and <16 x i8> %.val.i.i89, splat (i8 -33)
+ %i1 = add <16 x i8> %i, splat (i8 -71)
+ %.not51.i = icmp ult <16 x i8> %i1, splat (i8 -6)
+ %.not46.i = and <16 x i1> %.not49.i, %.not51.i
+ %i2 = bitcast <16 x i1> %.not46.i to i16
+ %_0.i = icmp eq i16 %i2, 0
+ br i1 %_0.i, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+soft-float" }
diff --git a/llvm/test/CodeGen/X86/sbb.ll b/llvm/test/CodeGen/X86/sbb.ll
index 78d609d..f5a3468 100644
--- a/llvm/test/CodeGen/X86/sbb.ll
+++ b/llvm/test/CodeGen/X86/sbb.ll
@@ -365,3 +365,32 @@ define i32 @uge_sext_add(i32 %0, i32 %1, i32 %2) {
%6 = add nsw i32 %5, %0
ret i32 %6
}
+
+define i32 @sub_sub_ugt(i32 %a, i32 %b) {
+; CHECK-LABEL: sub_sub_ugt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: sbbl %esi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ %sub = sub i32 %a, %b
+ %res = sub i32 %sub, %conv
+ ret i32 %res
+}
+
+define i32 @sub_sub_ult(i32 %a, i32 %b) {
+; CHECK-LABEL: sub_sub_ult:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: sbbl %esi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %b, %a
+ %conv = zext i1 %cmp to i32
+ %sub = sub i32 %a, %b
+ %res = sub i32 %sub, %conv
+ ret i32 %res
+}
+
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 7462c77..049ee47 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -613,8 +613,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: shldl %cl, %esi, %ebx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: movl %edi, %esi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %esi
; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %edx
diff --git a/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir b/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir
index 5d644c3..718fa6f 100644
--- a/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir
+++ b/llvm/test/DebugInfo/AArch64/asan-stack-vars.mir
@@ -366,7 +366,8 @@ frameInfo:
maxCallFrameSize: 0
localFrameSize: 144
machineFunctionInfo:
- stackSizeSVE: 0
+ stackSizeZPR: 0
+ stackSizePPR: 0
stack:
- { id: 0, name: StackGuardSlot, offset: -40, size: 8, alignment: 8,
stack-id: default, local-offset: -8 }
diff --git a/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir b/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
index 013d933..b7a9892 100644
--- a/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
+++ b/llvm/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
@@ -69,7 +69,8 @@ frameInfo:
hasCalls: true
maxCallFrameSize: 0
machineFunctionInfo:
- stackSizeSVE: 0
+ stackSizeZPR: 0
+ stackSizePPR: 0
stack:
- { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, stack-id: default }
- { id: 1, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
diff --git a/llvm/test/DebugInfo/X86/dynamic-bitfield.ll b/llvm/test/DebugInfo/X86/dynamic-bitfield.ll
index c9148ca4..f893597 100644
--- a/llvm/test/DebugInfo/X86/dynamic-bitfield.ll
+++ b/llvm/test/DebugInfo/X86/dynamic-bitfield.ll
@@ -27,7 +27,7 @@ source_filename = "bitfield.c"
!6 = !{}
!7 = !{!0, !2}
!8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PackedBits", file: !5, line: 3, size: 40, elements: !9)
-!9 = !{!10, !12, !16}
+!9 = !{!10, !12, !16, !21}
!10 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !8, file: !5, line: 5, baseType: !11, size: 8)
; CHECK: DW_TAG_member
; CHECK-NEXT: DW_AT_name{{.*}}"a"
@@ -60,5 +60,14 @@ source_filename = "bitfield.c"
; CHECK: DW_AT_bit_size [DW_FORM_exprloc] (DW_OP_lit27)
; CHECK-NEXT: DW_AT_data_bit_offset [DW_FORM_exprloc] (DW_OP_lit13)
; CHECK-NOT: DW_AT_data_member_location
-; CHECK: DW_TAG
!20 = !{!"clang version 3.9.0 (trunk 267633)"}
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "d", scope: !8, file: !5, line: 7, baseType: !13, offset: !DIExpression(DW_OP_constu, 15), flags: DIFlagBitField)
+; CHECK: DW_TAG_member
+; CHECK-NEXT: DW_AT_name{{.*}}"d"
+; CHECK-NOT: DW_TAG
+; CHECK-NOT: DW_AT_bit_offset
+; CHECK-NOT: DW_AT_byte_size
+; CHECK-NOT: DW_AT_bit_size
+; CHECK: DW_AT_data_bit_offset [DW_FORM_exprloc] (DW_OP_lit15)
+; CHECK-NOT: DW_AT_data_member_location
+; CHECK: DW_TAG
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
index 9296f04..ed76a28 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
@@ -22,7 +22,7 @@
# CHECK-OBJ: Contents of section .rodata:
# CHECK-OBJ: 0000 48310048 32004833 00 H1.H2.H3.
-# CHECK-LG: Starting link phase 1 for graph
+# CHECK-LG: Starting link phase 1
# CHECK-LG: section .rodata:
# CHECK-LG: block 0x0 size = 0x00000009, align = 1, alignment-offset = 0
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s
new file mode 100644
index 0000000..557e403
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-0.s
@@ -0,0 +1,7 @@
+ .section __DATA,__data
+ .globl x
+ .p2align 2, 0x0
+x:
+ .long 0
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s
new file mode 100644
index 0000000..711c8a0
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/x-1.s
@@ -0,0 +1,7 @@
+ .section __DATA,__data
+ .globl x
+ .p2align 2, 0x0
+x:
+ .long 1
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s
new file mode 100644
index 0000000..c58f84e
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_universal_slice_selection.s
@@ -0,0 +1,32 @@
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/main.o %s
+# RUN: llvm-mc -triple=arm64-apple-darwin -filetype=obj -o %t/x.arm64.o \
+# RUN: %S/Inputs/x-1.s
+# RUN: llvm-ar crs %t/libX.arm64.a %t/x.arm64.o
+# RUN: llvm-mc -triple=arm64e-apple-darwin -filetype=obj -o %t/x.arm64e.o \
+# RUN: %S/Inputs/x-0.s
+# RUN: llvm-ar crs %t/libX.arm64e.a %t/x.arm64e.o
+# RUN: llvm-lipo --create --output %t/libX.a %t/libX.arm64.a %t/libX.arm64e.a
+# RUN: llvm-jitlink -noexec -check=%s %t/main.o -L%t -lX
+#
+# Create a universal archive with two slices (arm64e, arm64), each containing
+# a definition of x: in the arm64e slice x = 0, in the arm64 slice x = 1.
+# Check that if we load an arm64e object file then we link against the arm64e
+# slice of the archive by verifying that x = 0.
+#
+
+# jitlink-check: *{4}x = 0
+
+ .section __TEXT,__text,regular,pure_instructions
+ .globl _main
+ .p2align 2
+_main:
+ mov w0, #0
+ ret
+
+ .section __DATA,__data
+ .globl p
+p:
+ .quad x
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
index 2b5c9e3..5f6babf 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
@@ -102,7 +102,7 @@ p:
call o
.size p, .-p
-# CHECK: Link graph "{{.*}}" before copy-and-fixup:
+# CHECK: Link graph before copy-and-fixup:
# CHECK: section .text:
# CHECK: block 0x1000
# CHECK: symbols:
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
index 3bbfd55..c31250b 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
@@ -131,7 +131,7 @@ p:
call o
.size p, .-p
-# CHECK: Link graph "{{.*}}" before copy-and-fixup:
+# CHECK: Link graph before copy-and-fixup:
# CHECK: section .text:
# CHECK: block 0x1000
# CHECK: symbols:
diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll
index 15ffcbf..eb2a9f1 100644
--- a/llvm/test/Transforms/GVN/condprop.ll
+++ b/llvm/test/Transforms/GVN/condprop.ll
@@ -321,6 +321,66 @@ different:
ret i1 %cmp3
}
+define i1 @test6_phi1(i1 %c, i32 %x, i32 %y) {
+; CHECK-LABEL: @test6_phi1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ false, [[BB1]] ], [ true, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i1 [[PHI]]
+; CHECK: bb3:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp.not = icmp ne i32 %x, %y
+ br i1 %c, label %bb1, label %bb2
+
+bb1:
+ %cmp = icmp eq i32 %x, %y
+ br i1 %cmp, label %bb2, label %bb3
+
+bb2:
+ %phi = phi i1 [ %cmp.not, %bb1 ], [ true, %entry ]
+ ret i1 %phi
+
+bb3:
+ ret i1 false
+}
+
+define i1 @test6_phi2(i1 %c, i32 %x, i32 %y) {
+; CHECK-LABEL: @test6_phi2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ [[CMP_NOT]], [[BB1]] ], [ true, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i1 [[PHI]]
+; CHECK: bb3:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ br i1 %c, label %bb1, label %bb2
+
+bb1:
+ %cmp.not = icmp ne i32 %x, %y
+ %cmp = icmp eq i32 %x, %y
+ br i1 %cmp, label %bb2, label %bb3
+
+bb2:
+ %phi = phi i1 [ %cmp.not, %bb1 ], [ true, %entry ]
+ ret i1 %phi
+
+bb3:
+ ret i1 false
+}
+
define i1 @test7(i32 %x, i32 %y) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll
index 119cffd..d94e78c 100644
--- a/llvm/test/Transforms/InstCombine/fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp.ll
@@ -1812,6 +1812,46 @@ define i1 @fcmp_ule_fsub_const(float %x, float %y) {
ret i1 %cmp
}
+define i1 @fcmp_ninf_ule_fsub_const(float %x, float %y) {
+; CHECK-LABEL: @fcmp_ninf_ule_fsub_const(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %fs = fsub float %x, %y
+ %cmp = fcmp ninf ule float %fs, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_nnan_ule_fsub_const(float %x, float %y) {
+; CHECK-LABEL: @fcmp_nnan_ule_fsub_const(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp nnan ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %fs = fsub float %x, %y
+ %cmp = fcmp nnan ule float %fs, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_ule_fsub_ninf_const(float %x, float %y) {
+; CHECK-LABEL: @fcmp_ule_fsub_ninf_const(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ninf ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %fs = fsub ninf float %x, %y
+ %cmp = fcmp ule float %fs, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_ule_fsub_nnan_const(float %x, float %y) {
+; CHECK-LABEL: @fcmp_ule_fsub_nnan_const(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp nnan ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %fs = fsub nnan float %x, %y
+ %cmp = fcmp ule float %fs, 0.000000e+00
+ ret i1 %cmp
+}
+
define i1 @fcmp_ugt_fsub_const(float %x, float %y) {
; CHECK-LABEL: @fcmp_ugt_fsub_const(
; CHECK-NEXT: [[FS:%.*]] = fsub float [[X:%.*]], [[Y:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-clamp.ll b/llvm/test/Transforms/InstCombine/icmp-clamp.ll
new file mode 100644
index 0000000..4866dbf
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-clamp.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+declare void @use(i32)
+
+define i1 @test_i32_eq(i32 %x) {
+; CHECK-LABEL: define i1 @test_i32_eq(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], 95
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 256
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_i32_ne(i32 %x) {
+; CHECK-LABEL: define i1 @test_i32_ne(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -161
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], -256
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp ne i32 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_i32_eq_no_add(i32 %x) {
+; CHECK-LABEL: define i1 @test_i32_eq_no_add(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 161
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 0)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_i32_ne_no_add(i32 %x) {
+; CHECK-LABEL: define i1 @test_i32_ne_no_add(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X]], 160
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 0)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp ne i32 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_unsigned_eq(i32 %x) {
+; CHECK-LABEL: define i1 @test_unsigned_eq(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -10
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 91
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10)
+ %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 100)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_unsigned_ne(i32 %x) {
+; CHECK-LABEL: define i1 @test_unsigned_ne(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -101
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], -91
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10)
+ %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 100)
+ %cmp = icmp ne i32 %v2, %x
+ ret i1 %cmp
+}
+
+
+; Different bit widths
+define i1 @test_i8_eq(i8 %x) {
+; CHECK-LABEL: define i1 @test_i8_eq(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X]], 50
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[TMP1]], 101
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i8 @llvm.smax.i8(i8 %x, i8 -50)
+ %v2 = tail call i8 @llvm.smin.i8(i8 %v1, i8 50)
+ %cmp = icmp eq i8 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_i16_eq(i16 %x) {
+; CHECK-LABEL: define i1 @test_i16_eq(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[X]], 1000
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[TMP1]], 2001
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -1000)
+ %v2 = tail call i16 @llvm.smin.i16(i16 %v1, i16 1000)
+ %cmp = icmp eq i16 %v2, %x
+ ret i1 %cmp
+}
+
+define i1 @test_i64_eq(i64 %x) {
+; CHECK-LABEL: define i1 @test_i64_eq(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[X]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i64 @llvm.smax.i64(i64 %x, i64 -1)
+ %v2 = tail call i64 @llvm.smin.i64(i64 %v1, i64 9223372036854775806)
+ %cmp = icmp eq i64 %v2, %x
+ ret i1 %cmp
+}
+
+; Negative tests - wrong predicate
+define i1 @test_wrong_pred_slt(i32 %x) {
+; CHECK-LABEL: define i1 @test_wrong_pred_slt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 160
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp slt i32 %v2, %x
+ ret i1 %cmp
+}
+
+
+; Negative tests - not a clamp pattern
+define i1 @test_not_clamp_pattern(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @test_not_clamp_pattern(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[Y]], i32 -95)
+; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160)
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %y, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Negative tests - Lo >= Hi
+define i1 @test_invalid_range(i32 %x) {
+; CHECK-LABEL: define i1 @test_invalid_range(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 50
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 100)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 50)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Negative tests - Lo is minimum signed value
+define i1 @test_lo_min_signed(i32 %x) {
+; CHECK-LABEL: define i1 @test_lo_min_signed(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 161
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -2147483648)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Negative tests - Hi is maximum signed value
+define i1 @test_hi_max_signed(i32 %x) {
+; CHECK-LABEL: define i1 @test_hi_max_signed(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], -96
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 2147483647)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Negative tests - Hi is maximum unsigned value
+define i1 @test_hi_max_unsigned(i32 %x) {
+; CHECK-LABEL: define i1 @test_hi_max_unsigned(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X]], 9
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.umax.i32(i32 %x, i32 10)
+ %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 4294967295)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Multi-use tests - multiple uses of max
+define i1 @test_multi_use_max(i32 %x) {
+; CHECK-LABEL: define i1 @test_multi_use_max(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X]], i32 -95)
+; CHECK-NEXT: call void @use(i32 [[V1]])
+; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160)
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ call void @use(i32 %v1)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Multi-use tests - multiple uses of min
+define i1 @test_multi_use_min(i32 %x) {
+; CHECK-LABEL: define i1 @test_multi_use_min(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X]], i32 -95)
+; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 160)
+; CHECK-NEXT: call void @use(i32 [[V2]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V2]], [[X]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ call void @use(i32 %v2)
+ %cmp = icmp eq i32 %v2, %x
+ ret i1 %cmp
+}
+
+; Commuted tests
+define i1 @test_commuted_eq(i32 %x) {
+; CHECK-LABEL: define i1 @test_commuted_eq(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], 95
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 256
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 -95)
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 160)
+ %cmp = icmp eq i32 %x, %v2
+ ret i1 %cmp
+}
+
+
+; Vector tests - splat constants
+define <2 x i1> @test_vec_splat_eq(<2 x i32> %x) {
+; CHECK-LABEL: define <2 x i1> @test_vec_splat_eq(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X]], splat (i32 50)
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[TMP1]], splat (i32 101)
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 -50>)
+ %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> <i32 50, i32 50>)
+ %cmp = icmp eq <2 x i32> %v2, %x
+ ret <2 x i1> %cmp
+}
+
+; Vector tests - poison elements
+define <2 x i1> @test_vec_poison_eq(<2 x i32> %x) {
+; CHECK-LABEL: define <2 x i1> @test_vec_poison_eq(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X]], <2 x i32> <i32 -50, i32 poison>)
+; CHECK-NEXT: [[V2:%.*]] = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[V1]], <2 x i32> <i32 50, i32 poison>)
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[V2]], [[X]]
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 poison>)
+ %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> <i32 50, i32 poison>)
+ %cmp = icmp eq <2 x i32> %v2, %x
+ ret <2 x i1> %cmp
+}
+
+; Vector tests - non-splat
+define <2 x i1> @test_vec_non_splat_eq(<2 x i32> %x) {
+; CHECK-LABEL: define <2 x i1> @test_vec_non_splat_eq(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X]], <2 x i32> <i32 -50, i32 -30>)
+; CHECK-NEXT: [[V2:%.*]] = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[V1]], <2 x i32> <i32 50, i32 70>)
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[V2]], [[X]]
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %v1 = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> <i32 -50, i32 -30>)
+ %v2 = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %v1, <2 x i32> <i32 50, i32 70>)
+ %cmp = icmp eq <2 x i32> %v2, %x
+ ret <2 x i1> %cmp
+}
diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll
new file mode 100644
index 0000000..1339afe
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll
@@ -0,0 +1,75 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after loop peeling.
+
+; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \
+; RUN: FileCheck -check-prefix=CHECK %s
+
+; The -implicit-check-not options make sure that no additional labels or calls
+; to @f show up.
+; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \
+; RUN: -unroll-force-peel-count=2 2>&1 | \
+; RUN: FileCheck %s -check-prefix=CHECK-UR \
+; RUN: -implicit-check-not='{{^[^ ;]*:}}' \
+; RUN: -implicit-check-not='call void @f'
+
+; CHECK: block-frequency-info: test
+; CHECK: do.body: float = 10.0,
+
+; The sum should still be ~10.
+;
+; CHECK-UR: block-frequency-info: test
+; CHECK-UR: - [[DO_BODY_PEEL:.*]]: float = 1.0,
+; CHECK-UR: - [[DO_BODY_PEEL2:.*]]: float = 0.9,
+; CHECK-UR: - [[DO_BODY:.*]]: float = 8.1,
+
+declare void @f(i32)
+
+define void @test(i32 %n) {
+; CHECK-UR-LABEL: define void @test(
+; CHECK-UR: [[ENTRY:.*]]:
+; CHECK-UR: br label %[[DO_BODY_PEEL_BEGIN:.*]]
+; CHECK-UR: [[DO_BODY_PEEL_BEGIN]]:
+; CHECK-UR: br label %[[DO_BODY_PEEL:.*]]
+; CHECK-UR: [[DO_BODY_PEEL]]:
+; CHECK-UR: call void @f
+; CHECK-UR: br i1 %{{.*}}, label %[[DO_END:.*]], label %[[DO_BODY_PEEL_NEXT:.*]], !prof ![[#PROF:]]
+; CHECK-UR: [[DO_BODY_PEEL_NEXT]]:
+; CHECK-UR: br label %[[DO_BODY_PEEL2:.*]]
+; CHECK-UR: [[DO_BODY_PEEL2]]:
+; CHECK-UR: call void @f
+; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_PEEL_NEXT1:.*]], !prof ![[#PROF]]
+; CHECK-UR: [[DO_BODY_PEEL_NEXT1]]:
+; CHECK-UR: br label %[[DO_BODY_PEEL_NEXT5:.*]]
+; CHECK-UR: [[DO_BODY_PEEL_NEXT5]]:
+; CHECK-UR: br label %[[ENTRY_PEEL_NEWPH:.*]]
+; CHECK-UR: [[ENTRY_PEEL_NEWPH]]:
+; CHECK-UR: br label %[[DO_BODY]]
+; CHECK-UR: [[DO_BODY]]:
+; CHECK-UR: call void @f
+; CHECK-UR: br i1 %{{.*}}, label %[[DO_END_LOOPEXIT:.*]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]]
+; CHECK-UR: [[DO_END_LOOPEXIT]]:
+; CHECK-UR: br label %[[DO_END]]
+; CHECK-UR: [[DO_END]]:
+; CHECK-UR: ret void
+
+entry:
+ br label %do.body
+
+do.body:
+ %i = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+ %inc = add i32 %i, 1
+ call void @f(i32 %i)
+ %c = icmp sge i32 %inc, %n
+ br i1 %c, label %do.end, label %do.body, !prof !0
+
+do.end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 9}
+
+; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9}
+; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_PC:]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; CHECK-UR: ![[#LOOP_UR_PC]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 8}
+; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}
diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll
index c58f8f1..63a0dd4 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll
@@ -15,9 +15,9 @@ define void @test() {
; CHECK: loop.peel:
; CHECK-NEXT: [[X_PEEL:%.*]] = call i32 @get.x()
; CHECK-NEXT: switch i32 [[X_PEEL]], label [[LOOP_LATCH_PEEL:%.*]] [
-; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL]]
-; CHECK-NEXT: i32 1, label [[LOOP_EXIT:%.*]]
-; CHECK-NEXT: i32 2, label [[LOOP_EXIT]]
+; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL]]
+; CHECK-NEXT: i32 1, label [[LOOP_EXIT:%.*]]
+; CHECK-NEXT: i32 2, label [[LOOP_EXIT]]
; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]]
; CHECK: loop.latch.peel:
; CHECK-NEXT: br label [[LOOP_PEEL_NEXT:%.*]]
@@ -26,10 +26,10 @@ define void @test() {
; CHECK: loop.peel2:
; CHECK-NEXT: [[X_PEEL3:%.*]] = call i32 @get.x()
; CHECK-NEXT: switch i32 [[X_PEEL3]], label [[LOOP_LATCH_PEEL4:%.*]] [
-; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL4]]
-; CHECK-NEXT: i32 1, label [[LOOP_EXIT]]
-; CHECK-NEXT: i32 2, label [[LOOP_EXIT]]
-; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT: i32 0, label [[LOOP_LATCH_PEEL4]]
+; CHECK-NEXT: i32 1, label [[LOOP_EXIT]]
+; CHECK-NEXT: i32 2, label [[LOOP_EXIT]]
+; CHECK-NEXT: ], !prof [[PROF0]]
; CHECK: loop.latch.peel4:
; CHECK-NEXT: br label [[LOOP_PEEL_NEXT1:%.*]]
; CHECK: loop.peel.next1:
@@ -41,31 +41,33 @@ define void @test() {
; CHECK: loop:
; CHECK-NEXT: [[X:%.*]] = call i32 @get.x()
; CHECK-NEXT: switch i32 [[X]], label [[LOOP_LATCH:%.*]] [
-; CHECK-NEXT: i32 0, label [[LOOP_LATCH]]
-; CHECK-NEXT: i32 1, label [[LOOP_EXIT_LOOPEXIT:%.*]]
-; CHECK-NEXT: i32 2, label [[LOOP_EXIT_LOOPEXIT]]
-; CHECK-NEXT: ], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: i32 0, label [[LOOP_LATCH]]
+; CHECK-NEXT: i32 1, label [[LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT: i32 2, label [[LOOP_EXIT_LOOPEXIT]]
+; CHECK-NEXT: ], !prof [[PROF0]]
; CHECK: loop.latch:
-; CHECK-NEXT: br label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br label [[LOOP]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: loop.exit.loopexit:
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK: loop.exit:
; CHECK-NEXT: ret void
+;
+; DISABLEADV-LABEL: @test(
+; DISABLEADV-NEXT: entry:
+; DISABLEADV-NEXT: br label [[LOOP:%.*]]
+; DISABLEADV: loop:
+; DISABLEADV-NEXT: [[X:%.*]] = call i32 @get.x()
+; DISABLEADV-NEXT: switch i32 [[X]], label [[LOOP_LATCH:%.*]] [
+; DISABLEADV-NEXT: i32 0, label [[LOOP_LATCH]]
+; DISABLEADV-NEXT: i32 1, label [[LOOP_EXIT:%.*]]
+; DISABLEADV-NEXT: i32 2, label [[LOOP_EXIT]]
+; DISABLEADV-NEXT: ], !prof [[PROF0:![0-9]+]]
+; DISABLEADV: loop.latch:
+; DISABLEADV-NEXT: br label [[LOOP]]
+; DISABLEADV: loop.exit:
+; DISABLEADV-NEXT: ret void
+;
-; DISABLEADV-LABEL: @test()
-; DISABLEADV-NEXT: entry:
-; DISABLEADV-NEXT: br label %loop
-; DISABLEADV: loop
-; DISABLEADV-NEXT: %x = call i32 @get.x()
-; DISABLEADV-NEXT: switch i32 %x, label %loop.latch [
-; DISABLEADV-NEXT: i32 0, label %loop.latch
-; DISABLEADV-NEXT: i32 1, label %loop.exit
-; DISABLEADV-NEXT: i32 2, label %loop.exit
-; DISABLEADV-NEXT: ], !prof !0
-; DISABLEADV: loop.latch:
-; DISABLEADV-NEXT: br label %loop
-; DISABLEADV: loop.exit:
-; DISABLEADV-NEXT: ret void
entry:
br label %loop
@@ -89,9 +91,9 @@ loop.exit:
;.
; CHECK: [[PROF0]] = !{!"branch_weights", i32 100, i32 200, i32 20, i32 10}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 90, i32 180, i32 20, i32 10}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 80, i32 160, i32 20, i32 10}
-; CHECK: [[LOOP3]] = distinct !{!3, !4, !5}
-; CHECK: [[META4:![0-9]+]] = !{!"llvm.loop.peeled.count", i32 2}
-; CHECK: [[META5:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META2]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.disable"}
+;.
+; DISABLEADV: [[PROF0]] = !{!"branch_weights", i32 100, i32 200, i32 20, i32 10}
;.
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
index d91cb5b..e951215 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
@@ -15,13 +15,13 @@
; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !16
; CHECK: [[NEXT0]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15
-; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !17
+; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !16
; CHECK: [[NEXT1]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15
-; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18
+; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !16
; CHECK: [[NEXT2]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit.loopexit, !prof !15
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !18
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !16, !llvm.loop !17
define i32 @basic(ptr %p, i32 %k, i1 %c) #0 !prof !15 {
entry:
@@ -84,6 +84,7 @@ attributes #1 = { nounwind optsize }
;CHECK: !15 = !{!"branch_weights", i32 1, i32 0}
; These are the weights of the latch and its copies.
;CHECK: !16 = !{!"branch_weights", i32 3001, i32 1001}
-;CHECK: !17 = !{!"branch_weights", i32 2000, i32 1001}
-;CHECK: !18 = !{!"branch_weights", i32 1001, i32 1001}
+;CHECK: !17 = distinct !{!17, !18, !19, {{.*}}}
+;CHECK: !18 = !{!"llvm.loop.peeled.count", i32 4}
+;CHECK: !19 = !{!"llvm.loop.estimated_trip_count", i32 0}
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
index 15dce234..dec126f 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -5,7 +5,7 @@
; RUN: opt < %s -S -profile-summary-huge-working-set-size-threshold=9 -debug-only=loop-unroll -passes='require<profile-summary>,function(require<opt-remark-emit>,loop-unroll)' 2>&1 | FileCheck %s --check-prefix=NOPEEL
; REQUIRES: asserts
-; Make sure we use the profile information correctly to peel-off 3 iterations
+; Make sure we use the profile information correctly to peel off 4 iterations
; from the loop, and update the branch weights for the peeled loop properly.
; CHECK: Loop Unroll: F[basic]
@@ -20,11 +20,11 @@
; CHECK-LABEL: @basic
; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !15
; CHECK: [[NEXT0]]:
-; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !16
+; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !15
; CHECK: [[NEXT1]]:
-; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !17
+; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !15
; CHECK: [[NEXT2]]:
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !17
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !15, !llvm.loop !16
define void @basic(ptr %p, i32 %k) #0 !prof !15 {
entry:
@@ -104,6 +104,7 @@ attributes #1 = { nounwind optsize }
!16 = !{!"branch_weights", i32 3001, i32 1001}
;CHECK: !15 = !{!"branch_weights", i32 3001, i32 1001}
-;CHECK: !16 = !{!"branch_weights", i32 2000, i32 1001}
-;CHECK: !17 = !{!"branch_weights", i32 1001, i32 1001}
+;CHECK: !16 = distinct !{!16, !17, !18, {{.*}}}
+;CHECK: !17 = !{!"llvm.loop.peeled.count", i32 4}
+;CHECK: !18 = !{!"llvm.loop.estimated_trip_count", i32 0}
diff --git a/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll b/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll
index ec71c67..0a3d201 100644
--- a/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll
+++ b/llvm/test/Transforms/LoopUnroll/scev-invalidation-lcssa.ll
@@ -3,7 +3,7 @@
define i32 @f(i1 %cond1) #0 !prof !0 {
; CHECK-LABEL: define i32 @f
-; CHECK-SAME: (i1 [[COND1:%.*]]) !prof [[PROF0:![0-9]+]] {
+; CHECK-SAME: (i1 [[COND1:%.*]]) {{.*}}{
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP1_PEEL_BEGIN:%.*]]
; CHECK: loop1.peel.begin:
@@ -19,7 +19,7 @@ define i32 @f(i1 %cond1) #0 !prof !0 {
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop1:
; CHECK-NEXT: [[LD:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: br i1 [[COND1]], label [[LOOP1]], label [[EXIT1_LOOPEXIT:%.*]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND1]], label [[LOOP1]], label [[EXIT1_LOOPEXIT:%.*]], !prof [[PROF1]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: exit1.loopexit:
; CHECK-NEXT: [[LD_LCSSA_PH:%.*]] = phi i64 [ [[LD]], [[LOOP1]] ]
; CHECK-NEXT: br label [[EXIT1]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 32fdc5cd6..56a1abd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -113,3 +113,49 @@ loop:
exit:
ret float %max.next
}
+
+define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
+; CHECK-LABEL: define float @test_fmax_and_fmin(
+; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]])
+; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]]
+; CHECK-NEXT: ret float [[SUB]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %min = phi float [ 0.000000e+00, %entry ], [ %min.next, %loop ]
+ %max = phi float [ 0.000000e+00, %entry ], [ %max.next, %loop ]
+ %gep.src.0 = getelementptr inbounds nuw float, ptr %src.0, i64 %iv
+ %gep.src.1 = getelementptr inbounds nuw float, ptr %src.1, i64 %iv
+ %l.0 = load float, ptr %gep.src.0, align 4
+ %l.1 = load float, ptr %gep.src.1, align 4
+ %max.next = tail call noundef float @llvm.maxnum.f32(float %max, float %l.0)
+ %min.next = tail call noundef float @llvm.minnum.f32(float %min, float %l.1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %sub = fsub float %max.next, %min.next
+ ret float %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index 0086f6e..b033f60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -20,22 +20,22 @@ define i32 @red_zext_mul_by_63(ptr %start, ptr %end) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63)
-; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ]
@@ -48,7 +48,7 @@ define i32 @red_zext_mul_by_63(ptr %start, ptr %end) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
entry:
@@ -86,17 +86,17 @@ define i32 @red_zext_mul_by_255(ptr %start, ptr %end) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 255)
-; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -218,22 +218,22 @@ define i32 @red_sext_mul_by_63(ptr %start, ptr %end) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63)
-; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ]
@@ -246,7 +246,7 @@ define i32 @red_sext_mul_by_63(ptr %start, ptr %end) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll
new file mode 100644
index 0000000..e97d6e66d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF2IC1 %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF2IC2 %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck --check-prefix=VF1IC2 %s
+
+define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
+; VF2IC1-LABEL: define i32 @FOR_used_outside(
+; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*]]:
+; VF2IC1-NEXT: br label %[[LOOP:.*]]
+; VF2IC1: [[LOOP]]:
+; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]]
+; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4
+; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]]
+; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]]
+; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4
+; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1
+; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC1: [[FOR_END]]:
+; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF2IC1-NEXT: ret i32 [[TMP32]]
+;
+; VF2IC2-LABEL: define i32 @FOR_used_outside(
+; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*]]:
+; VF2IC2-NEXT: br label %[[LOOP:.*]]
+; VF2IC2: [[LOOP]]:
+; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
+; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4
+; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]]
+; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]]
+; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4
+; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1
+; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC2: [[FOR_END]]:
+; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF2IC2-NEXT: ret i32 [[TMP66]]
+;
+; VF1IC2-LABEL: define i32 @FOR_used_outside(
+; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF1IC2-NEXT: [[ENTRY:.*]]:
+; VF1IC2-NEXT: br label %[[LOOP:.*]]
+; VF1IC2: [[LOOP]]:
+; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
+; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]]
+; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
+; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4
+; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
+; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF1IC2: [[FOR_END]]:
+; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF1IC2-NEXT: ret i32 [[TMP30]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %for = phi i32 [ 33, %entry ], [ %for.next, %loop ]
+ %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv
+ %for.next = load i32, ptr %gep.A, align 4
+ %add = add nsw i32 %for, %for.next
+ %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv
+ store i32 %add, ptr %gep.B, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %for.end, label %loop
+
+for.end:
+ ret i32 %for
+}
+
+define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
+; VF2IC1-LABEL: define i32 @FOR_next_used_outside(
+; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*]]:
+; VF2IC1-NEXT: br label %[[LOOP:.*]]
+; VF2IC1: [[LOOP]]:
+; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]]
+; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4
+; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]]
+; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]]
+; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4
+; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1
+; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC1: [[FOR_END]]:
+; VF2IC1-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ]
+; VF2IC1-NEXT: ret i32 [[TMP28]]
+;
+; VF2IC2-LABEL: define i32 @FOR_next_used_outside(
+; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*]]:
+; VF2IC2-NEXT: br label %[[LOOP:.*]]
+; VF2IC2: [[LOOP]]:
+; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
+; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4
+; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]]
+; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]]
+; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4
+; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1
+; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC2: [[FOR_END]]:
+; VF2IC2-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ]
+; VF2IC2-NEXT: ret i32 [[TMP62]]
+;
+; VF1IC2-LABEL: define i32 @FOR_next_used_outside(
+; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF1IC2-NEXT: [[ENTRY:.*]]:
+; VF1IC2-NEXT: br label %[[LOOP:.*]]
+; VF1IC2: [[LOOP]]:
+; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
+; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]]
+; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
+; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4
+; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
+; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF1IC2: [[FOR_END]]:
+; VF1IC2-NEXT: [[TMP27:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ]
+; VF1IC2-NEXT: ret i32 [[TMP27]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %for = phi i32 [ 33, %entry ], [ %for.next, %loop ]
+ %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv
+ %for.next = load i32, ptr %gep.A, align 4
+ %add = add nsw i32 %for, %for.next
+ %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv
+ store i32 %add, ptr %gep.B, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %for.end, label %loop
+
+for.end:
+ ret i32 %for.next
+}
+
+define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) {
+; VF2IC1-LABEL: define i32 @FOR_and_next_used_outside(
+; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*]]:
+; VF2IC1-NEXT: br label %[[LOOP:.*]]
+; VF2IC1: [[LOOP]]:
+; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]]
+; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4
+; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]]
+; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]]
+; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4
+; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1
+; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC1: [[FOR_END]]:
+; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ]
+; VF2IC1-NEXT: [[RES:%.*]] = add i32 [[TMP32]], [[TMP33]]
+; VF2IC1-NEXT: ret i32 [[RES]]
+;
+; VF2IC2-LABEL: define i32 @FOR_and_next_used_outside(
+; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*]]:
+; VF2IC2-NEXT: br label %[[LOOP:.*]]
+; VF2IC2: [[LOOP]]:
+; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
+; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4
+; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]]
+; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]]
+; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4
+; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1
+; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF2IC2: [[FOR_END]]:
+; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ]
+; VF2IC2-NEXT: [[RES:%.*]] = add i32 [[TMP66]], [[TMP71]]
+; VF2IC2-NEXT: ret i32 [[RES]]
+;
+; VF1IC2-LABEL: define i32 @FOR_and_next_used_outside(
+; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; VF1IC2-NEXT: [[ENTRY:.*]]:
+; VF1IC2-NEXT: br label %[[LOOP:.*]]
+; VF1IC2: [[LOOP]]:
+; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]]
+; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4
+; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]]
+; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]]
+; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4
+; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
+; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]]
+; VF1IC2: [[FOR_END]]:
+; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ]
+; VF1IC2-NEXT: [[RES:%.*]] = add i32 [[TMP30]], [[TMP33]]
+; VF1IC2-NEXT: ret i32 [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %for = phi i32 [ 33, %entry ], [ %for.next, %loop ]
+ %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv
+ %for.next = load i32, ptr %gep.A, align 4
+ %add = add nsw i32 %for, %for.next
+ %gep.B = getelementptr inbounds nuw i32, ptr %B, i64 %iv
+ store i32 %add, ptr %gep.B, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %for.end, label %loop
+
+for.end:
+ %res = add i32 %for, %for.next
+ ret i32 %res
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index 616f156..5b7c27a 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -113,3 +113,49 @@ loop:
exit:
ret float %max.next
}
+
+define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
+; CHECK-LABEL: define float @test_fmax_and_fmin(
+; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]])
+; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]]
+; CHECK-NEXT: ret float [[SUB]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %min = phi float [ 0.000000e+00, %entry ], [ %min.next, %loop ]
+ %max = phi float [ 0.000000e+00, %entry ], [ %max.next, %loop ]
+ %gep.src.0 = getelementptr inbounds nuw float, ptr %src.0, i64 %iv
+ %gep.src.1 = getelementptr inbounds nuw float, ptr %src.1, i64 %iv
+ %l.0 = load float, ptr %gep.src.0, align 4
+ %l.1 = load float, ptr %gep.src.1, align 4
+ %max.next = tail call noundef float @llvm.maxnum.f32(float %max, float %l.0)
+ %min.next = tail call noundef float @llvm.minnum.f32(float %min, float %l.1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %sub = fsub float %max.next, %min.next
+ ret float %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index 1a2b233..8b6a6e1 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -683,3 +683,49 @@ loop:
exit:
ret float %max.next
}
+
+define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) {
+; CHECK-LABEL: define float @test_fmax_and_fmax(
+; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]])
+; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]]
+; CHECK-NEXT: ret float [[SUB]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %min = phi float [ 0.000000e+00, %entry ], [ %min.next, %loop ]
+ %max = phi float [ 0.000000e+00, %entry ], [ %max.next, %loop ]
+ %gep.src.0 = getelementptr inbounds nuw float, ptr %src.0, i64 %iv
+ %gep.src.1 = getelementptr inbounds nuw float, ptr %src.1, i64 %iv
+ %l.0 = load float, ptr %gep.src.0, align 4
+ %l.1 = load float, ptr %gep.src.1, align 4
+ %max.next = tail call noundef float @llvm.maxnum.f32(float %max, float %l.0)
+ %min.next = tail call noundef float @llvm.minnum.f32(float %min, float %l.1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %sub = fsub float %max.next, %min.next
+ ret float %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
index b07c3833..b51db48 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
@@ -1,63 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -S < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
; Make sure the selects generated from reduction are always emitted
; in deterministic order.
-; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK: [[VEC_PHI_1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_5:%.+]], %vector.body ]
-; CHECK: [[VEC_PHI_2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_3:%.+]], %vector.body ]
-; CHECK: icmp ule <4 x i64>
-; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
-; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
-; CHECK: select <4 x i1> {{.*}}, <4 x i32> [[ADD_5]], <4 x i32>
-; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> [[ADD_3]], <4 x i32>
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
-define internal i64 @foo(ptr %t0) !prof !1 {
-t16:
- br label %t20
-
-t17: ; preds = %t20
- %t18 = phi i32 [ %t24, %t20 ]
- %t19 = phi i32 [ %t28, %t20 ]
- br label %t31
+define i32 @foo() !prof !1 {
+; CHECK-LABEL: define i32 @foo() {{.*}}{
+; CHECK-NEXT: [[T16:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9)
+; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
+; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ br label %loop
-t20: ; preds = %t20, %t16
- %t21 = phi i64 [ 0, %t16 ], [ %t29, %t20 ]
- %t22 = phi i32 [ 0, %t16 ], [ %t28, %t20 ]
- %t23 = phi i32 [ 0, %t16 ], [ %t24, %t20 ]
- %t24 = add i32 3, %t23
- %t28 = add i32 %t22, 5
- %t29 = add nuw nsw i64 %t21, 1
- %t30 = icmp eq i64 %t29, 10
- br i1 %t30, label %t17, label %t20, !prof !2
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ]
+ %red.2 = phi i32 [ 0, %entry ], [ %red.2.next, %loop ]
+ %red.2.next = add i32 3, %red.2
+ %red.1.next = add i32 %red.1, 5
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 10
+ br i1 %ec, label %exit, label %loop, !prof !2
-t31:
- ret i64 undef
+exit:
+ %r.2 = phi i32 [ %red.2.next, %loop ]
+ %r.1 = phi i32 [ %red.1.next, %loop ]
+ %add = add i32 %r.2, %r.1
+ ret i32 %add
}
; Make sure we do not fail when checking for ordered reduction. This test just
; exercises the path and bails out without performing vectorization.
-; CHECK-LABEL: quux
-; CHECK-NOT: fadd <4 x
-define void @quux(i1 %arg) {
-bb:
+define double @quux(i1 %arg) {
+; CHECK-LABEL: define double @quux(
+; CHECK-SAME: i1 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[TMP5:%.*]] = phi double [ 1.300000e+01, %[[ENTRY]] ], [ [[TMP:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP5]], 1.000000e+00
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: [[TMP]] = phi double [ [[TMP6]], %[[HEADER]] ]
+; CHECK-NEXT: br i1 [[ARG]], label %[[HEADER]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[R:%.*]] = phi double [ [[TMP]], %[[LATCH]] ]
+; CHECK-NEXT: ret double [[R]]
+;
+entry:
br label %header
-latch: ; preds = %header
- %tmp = phi double [ %tmp6, %header ]
- br i1 %arg, label %header, label %bb2
-
-bb2: ; preds = %latch
- %tmp3 = phi double [ %tmp, %latch ]
- ret void
-
-header: ; preds = %latch, %bb
- %tmp5 = phi double [ 1.300000e+01, %bb ], [ %tmp, %latch ]
+header:
+ %tmp5 = phi double [ 1.300000e+01, %entry ], [ %tmp, %latch ]
%tmp6 = fadd double %tmp5, 1.000000e+00
br label %latch
+
+latch:
+ %tmp = phi double [ %tmp6, %header ]
+ br i1 %arg, label %header, label %exit
+
+exit:
+ %r = phi double [ %tmp, %latch ]
+ ret double %r
}
!1 = !{!"function_entry_count", i64 801}
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
index 644900d..9620697 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
@@ -500,3 +500,50 @@ exit:
%first.addr.0.lcssa.i = phi ptr [ %first, %entry ], [ %iv, %loop.header ], [ %iv.next, %loop.latch ]
ret ptr %first.addr.0.lcssa.i
}
+
+define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context(ptr noalias %p1, ptr noalias %p2) nosync {
+; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context(
+; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK: [[LOOP_INC]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK: [[LOOP_END]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX1]], %[[LOOP]] ], [ -1, %[[LOOP_INC]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %p1, i64 4), "dereferenceable"(ptr %p1, i64 1024) ]
+ call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 4), "dereferenceable"(ptr %p2, i64 1024) ]
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 1024
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i64 [ %index, %loop ], [ -1, %loop.inc ]
+ ret i64 %retval
+}
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index e09ddb4..731d648 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -1636,7 +1636,11 @@ static std::pair<Triple, SubtargetFeatures> getFirstFileTripleAndFeatures() {
case file_magic::macho_object: {
auto Obj = ExitOnErr(
object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()));
- Triple TT = Obj->makeTriple();
+ Triple TT;
+ if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj.get()))
+ TT = MachOObj->getArchTriple();
+ else
+ TT = Obj->makeTriple();
if (Magic == file_magic::coff_object) {
// TODO: Move this to makeTriple() if possible.
TT.setObjectFormat(Triple::COFF);
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 141282e..30f0a8e5 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -10176,4 +10176,11 @@ TEST(APFloatTest, hasSignBitInMSB) {
EXPECT_FALSE(APFloat::hasSignBitInMSB(APFloat::Float8E8M0FNU()));
 }

+TEST(APFloatTest, FrexpQuietSNaN) {
+ APFloat SNaN = APFloat::getSNaN(APFloat::PPCDoubleDouble());
+ int Exp;
+ APFloat Result = frexp(SNaN, Exp, APFloat::rmNearestTiesToEven);
+ EXPECT_FALSE(Result.isSignaling());
+}
+
} // namespace
diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index d1c0f64..d8457a3 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -230,8 +230,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -280,8 +279,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -335,8 +333,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -395,8 +392,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
CallBase *Call = findCall(*Func, "call");
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
@@ -467,8 +463,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
ASSERT_NE(Call, nullptr);
Trie.buildAndAttachMIBMetadata(Call);
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals));
@@ -541,8 +536,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
// Restore original option value.
MemProfKeepAllNotColdContexts = OrigMemProfKeepAllNotColdContexts;
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
EXPECT_THAT(MemProfMD, MemprofMetadataEquals(ExpectedVals));
@@ -670,8 +664,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
// The hot allocations will be converted to NotCold and pruned as they
// are unnecessary to determine how to clone the cold allocation.
- EXPECT_TRUE(Call->hasFnAttr("memprof"));
- EXPECT_EQ(Call->getFnAttr("memprof").getValueAsString(), "ambiguous");
+ EXPECT_FALSE(Call->hasFnAttr("memprof"));
EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof));
MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
ASSERT_EQ(MemProfMD->getNumOperands(), 2u);
diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt
index 836a844..1ce34e7 100644
--- a/llvm/unittests/Frontend/CMakeLists.txt
+++ b/llvm/unittests/Frontend/CMakeLists.txt
@@ -1,5 +1,6 @@
set(LLVM_LINK_COMPONENTS
Analysis
+ BinaryFormat
Core
FrontendHLSL
FrontendOffloading
diff --git a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
index b856d1c..764ebb9 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Rewrite/BUILD.gn
@@ -28,6 +28,7 @@ static_library("Rewrite") {
"BuildIDRewriter.cpp",
"DWARFRewriter.cpp",
"ExecutableFileMemoryManager.cpp",
+ "GNUPropertyRewriter.cpp",
"JITLinkLinker.cpp",
"LinuxKernelRewriter.cpp",
"MachORewriteInstance.cpp",