aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAmara Emerson <amara@apple.com>2026-03-17 13:25:40 -0700
committerAmara Emerson <amara@apple.com>2026-03-18 15:37:29 -0700
commit6ae8c3dee3d367cde335c235bc1ebc3a2046929b (patch)
treed8b5722abfb3e6a33852da4badb7c20e9b879349
parent9da068b7cfa269d736d5261b8c82b701a718e531 (diff)
downloadllvm-users/amara/expandpseudo-multivec.tar.gz
llvm-users/amara/expandpseudo-multivec.tar.bz2
llvm-users/amara/expandpseudo-multivec.zip
[AArch64][SVE] Use multi-vector spill/fill instructions in pseudo expansion (branch: users/amara/expandpseudo-multivec)
SVE tuple spill/fill pseudos (STR_ZZXI, LDR_ZZZZXI, etc.) currently expand into N individual STR_ZXI/LDR_ZXI instructions. On targets with SVE2.1 or SME2 we can instead directly generate a ptrue + multi-vector load/store. For the x2 case this is neutral in code size but can still be beneficial depending on the target microarchitecture, so we assume it is generally profitable for now. rdar://168475826
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp167
-rw-r--r--llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir534
-rw-r--r--llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll351
5 files changed, 1058 insertions, 6 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 4b3353e54797..23b1c5e1fb79 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/Triple.h"
#include <cassert>
@@ -41,6 +42,10 @@
using namespace llvm;
+static cl::opt<bool> EnableMultiVecSpillFill(
+ "aarch64-enable-multivec-spill-fill", cl::init(true), cl::Hidden,
+ cl::desc("Enable multi-vector spill/fill expansion for SVE"));
+
#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
namespace {
@@ -90,6 +95,11 @@ private:
bool expandSVESpillFill(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned Opc,
unsigned N);
+ bool tryExpandSVESpillFillToMultiVec(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned MultiVecOpc, unsigned N,
+ LivePhysRegs &LiveRegs);
+ bool expandSVEMultiVecSpillFills(MachineBasicBlock &MBB);
bool expandCALL_RVMARKER(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
@@ -111,6 +121,10 @@ private:
MachineBasicBlock::iterator MBBI);
MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
+
+ // Cached scratch PNR register for multi-vector spill/fill optimization.
+ // Reset at the start of each block.
+ Register CachedScratchPNR = AArch64::NoRegister;
};
} // end anonymous namespace
@@ -859,6 +873,74 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
return true;
}
+/// Try to use multi-vector instructions to replace the tuple load/stores,
+/// returns true if successful.
+bool AArch64ExpandPseudo::tryExpandSVESpillFillToMultiVec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned MultiVecOpc, unsigned N, LivePhysRegs &LiveRegs) {
+ MachineFunction &MF = *MBB.getParent();
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ MachineInstr &MI = *MBBI;
+ Register TupleReg = MI.getOperand(0).getReg();
+ Register BaseReg = MI.getOperand(1).getReg();
+ int BaseOffset = MI.getOperand(2).getImm();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Check immediate range for multi-vector instructions.
+ // simm4s2 for x2: -8 to +7 (scaled by 2)
+ // simm4s4 for x4: -8 to +7 (scaled by 4)
+ int Scale = N == 4 ? 4 : 2;
+ int ScaledOffset = BaseOffset / Scale;
+ if ((BaseOffset % Scale != 0) || ScaledOffset < -8 || ScaledOffset > 7)
+ return false;
+
+ // Find a scratch PNR register that is not live at this point.
+ // First check if our cached register is still usable.
+ Register ScratchPNR = CachedScratchPNR;
+
+ if (ScratchPNR == AArch64::NoRegister || LiveRegs.contains(ScratchPNR)) {
+ ScratchPNR = AArch64::NoRegister;
+ for (MCPhysReg Reg :
+ {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11,
+ AArch64::PN12, AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+ if (!LiveRegs.contains(Reg)) {
+ ScratchPNR = Reg;
+ break;
+ }
+ }
+ if (ScratchPNR == AArch64::NoRegister)
+ return false; // No available scratch register, bail out.
+ CachedScratchPNR = ScratchPNR;
+ }
+
+ // Get the register tuple for the multi-vector instruction.
+ Register FirstReg = TRI->getSubReg(TupleReg, AArch64::zsub0);
+ unsigned RegNum = FirstReg - AArch64::Z0;
+ Register MultiVecTupleReg =
+ (N == 4) ? AArch64::Z0_Z1_Z2_Z3 + RegNum : AArch64::Z0_Z1 + RegNum;
+
+ bool IsLoad = (MultiVecOpc == AArch64::LD1B_2Z_IMM ||
+ MultiVecOpc == AArch64::LD1B_4Z_IMM);
+ bool Kill = MI.getOperand(1).isKill();
+
+ // Insert PTRUE_C_B to set up the predicate.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::PTRUE_C_B), ScratchPNR);
+
+ // Build the multi-vector instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(MultiVecOpc));
+ if (IsLoad)
+ MIB.addReg(MultiVecTupleReg, RegState::Define);
+ else
+ MIB.addReg(MultiVecTupleReg);
+ MIB.addReg(ScratchPNR, RegState::Kill);
+ MIB.addReg(BaseReg, getKillRegState(Kill));
+ MIB.addImm(ScaledOffset);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned Opc, unsigned N) {
@@ -2005,11 +2087,96 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
+/// Map SVE tuple spill/fill pseudo opcodes to multi-vector opcodes.
+/// Returns {MultiVecOpc, N} or {0, 0} if not a candidate.
+/// Only ZPR (not PPR) spills with contiguous, properly-aligned registers
+/// can use multi-vector instructions.
+static std::pair<unsigned, unsigned>
+getSVEMultiVecSpillFillInfo(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
+ if (AArch64::ZPR4Mul4RegClass.contains(MI.getOperand(0).getReg()))
+ return {AArch64::ST1B_4Z_IMM, 4};
+ return {0, 0};
+ case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
+ if (AArch64::ZPR2Mul2RegClass.contains(MI.getOperand(0).getReg()))
+ return {AArch64::ST1B_2Z_IMM, 2};
+ return {0, 0};
+ case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
+ if (AArch64::ZPR4Mul4RegClass.contains(MI.getOperand(0).getReg()))
+ return {AArch64::LD1B_4Z_IMM, 4};
+ return {0, 0};
+ case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
+ if (AArch64::ZPR2Mul2RegClass.contains(MI.getOperand(0).getReg()))
+ return {AArch64::LD1B_2Z_IMM, 2};
+ return {0, 0};
+ default:
+ return {0, 0};
+ }
+}
+
+/// Try to expand SVE spill/fill pseudos into multi-vector instructions.
+/// Iterates backward through the block with incremental liveness to find
+/// free PNR scratch registers, avoiding O(n^2) liveness recomputation.
+bool AArch64ExpandPseudo::expandSVEMultiVecSpillFills(MachineBasicBlock &MBB) {
+ if (!EnableMultiVecSpillFill || MBB.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ if (!STI.hasSVE2p1() && !(STI.hasSME2() && STI.isStreaming()))
+ return false;
+
+ bool Modified = false;
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(MBB);
+ auto MBBI = MBB.end();
+ while (MBBI != MBB.begin()) {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
+ auto [MultiVecOpc, N] = getSVEMultiVecSpillFillInfo(MI);
+ if (MultiVecOpc != 0) {
+ // Save iterator to instruction after MI so we can find inserted
+ // instructions after MI is erased.
+ auto AfterMI = std::next(MachineBasicBlock::iterator(MBBI));
+ if (tryExpandSVESpillFillToMultiVec(MBB, MBBI, MultiVecOpc, N,
+ LiveRegs)) {
+ Modified = true;
+ // MI was erased. Two new instructions (PTRUE_C_B + multi-vec)
+ // were inserted before AfterMI. Step liveness backward over them.
+ auto It = AfterMI;
+ assert(It != MBB.begin());
+ --It; // multi-vec instruction
+ LiveRegs.stepBackward(*It);
+ assert(It != MBB.begin());
+ --It; // PTRUE_C_B
+ LiveRegs.stepBackward(*It);
+ MBBI = It; // Loop will --MBBI to get instruction before PTRUE_C_B
+ continue;
+ }
+ }
+ // Step liveness backward past this instruction.
+ LiveRegs.stepBackward(MI);
+ }
+ return Modified;
+}
+
/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
+ // Reset cached scratch PNR at the start of each block.
+ CachedScratchPNR = AArch64::NoRegister;
+
+ Modified |= expandSVEMultiVecSpillFills(MBB);
+
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 0e62c7f22341..fa54d66fd2dd 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_i8_z0_z8:
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1cec418249d4..d0a35471e884 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; STRIDED-LABEL: ldnt1_x2_i8_z0_z8:
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir b/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir
new file mode 100644
index 000000000000..c45e15e9d740
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir
@@ -0,0 +1,534 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -run-pass=aarch64-expand-pseudo \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=SVE2P1
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -run-pass=aarch64-expand-pseudo \
+# RUN: -force-streaming -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=SVE2P1
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -run-pass=aarch64-expand-pseudo \
+# RUN: -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs %s -o - \
+# RUN: | FileCheck %s --check-prefix=NOPT
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=aarch64-expand-pseudo \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOPT
+---
+name: store_x2_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_contiguous
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_contiguous
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, 0
+ RET_ReallyLR
+...
+---
+name: load_x2_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SVE2P1-LABEL: name: load_x2_contiguous
+ ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: $z0_z1 = LD1B_2Z_IMM killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr, implicit $z0_z1
+ ;
+ ; NOPT-LABEL: name: load_x2_contiguous
+ ; NOPT: $z0 = LDR_ZXI $sp, 0
+ ; NOPT-NEXT: $z1 = LDR_ZXI $sp, 1
+ ; NOPT-NEXT: RET undef $lr, implicit $z0_z1
+ $z0_z1 = LDR_ZZXI $sp, 0
+ RET_ReallyLR implicit $z0_z1
+...
+---
+name: store_x4_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2_z3
+
+ ; SVE2P1-LABEL: name: store_x4_contiguous
+ ; SVE2P1: liveins: $z0_z1_z2_z3
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_contiguous
+ ; NOPT: liveins: $z0_z1_z2_z3
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 2
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, 3
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z0_z1_z2_z3, $sp, 0
+ RET_ReallyLR
+...
+---
+name: load_x4_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SVE2P1-LABEL: name: load_x4_contiguous
+ ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: $z0_z1_z2_z3 = LD1B_4Z_IMM killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr, implicit $z0_z1_z2_z3
+ ;
+ ; NOPT-LABEL: name: load_x4_contiguous
+ ; NOPT: $z0 = LDR_ZXI $sp, 0
+ ; NOPT-NEXT: $z1 = LDR_ZXI $sp, 1
+ ; NOPT-NEXT: $z2 = LDR_ZXI $sp, 2
+ ; NOPT-NEXT: $z3 = LDR_ZXI $sp, 3
+ ; NOPT-NEXT: RET undef $lr, implicit $z0_z1_z2_z3
+ $z0_z1_z2_z3 = LDR_ZZZZXI $sp, 0
+ RET_ReallyLR implicit $z0_z1_z2_z3
+...
+---
+name: store_x2_strided_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_strided_contiguous
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_strided_contiguous
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI_STRIDED_CONTIGUOUS $z0_z1, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x4_strided_contiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z4_z5_z6_z7
+
+ ; SVE2P1-LABEL: name: store_x4_strided_contiguous
+ ; SVE2P1: liveins: $z4_z5_z6_z7
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_4Z_IMM $z4_z5_z6_z7, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_strided_contiguous
+ ; NOPT: liveins: $z4_z5_z6_z7
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z4, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z5, $sp, 1
+ ; NOPT-NEXT: STR_ZXI $z6, $sp, 2
+ ; NOPT-NEXT: STR_ZXI $z7, $sp, 3
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI_STRIDED_CONTIGUOUS $z4_z5_z6_z7, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x2_noncontiguous
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z8
+
+ ; SVE2P1-LABEL: name: store_x2_noncontiguous
+ ; SVE2P1: liveins: $z0_z8
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0
+ ; SVE2P1-NEXT: STR_ZXI $z8, $sp, 1
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_noncontiguous
+ ; NOPT: liveins: $z0_z8
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z8, $sp, 1
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI_STRIDED_CONTIGUOUS $z0_z8, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x2_odd_aligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z1_z2
+
+ ; SVE2P1-LABEL: name: store_x2_odd_aligned
+ ; SVE2P1: liveins: $z1_z2
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 0
+ ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 1
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_odd_aligned
+ ; NOPT: liveins: $z1_z2
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 1
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z1_z2, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x4_not_4_aligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z2_z3_z4_z5
+
+ ; SVE2P1-LABEL: name: store_x4_not_4_aligned
+ ; SVE2P1: liveins: $z2_z3_z4_z5
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 0
+ ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 1
+ ; SVE2P1-NEXT: STR_ZXI $z4, $sp, 2
+ ; SVE2P1-NEXT: STR_ZXI $z5, $sp, 3
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_not_4_aligned
+ ; NOPT: liveins: $z2_z3_z4_z5
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, 1
+ ; NOPT-NEXT: STR_ZXI $z4, $sp, 2
+ ; NOPT-NEXT: STR_ZXI $z5, $sp, 3
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z2_z3_z4_z5, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x3_no_multivec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2
+
+ ; SVE2P1-LABEL: name: store_x3_no_multivec
+ ; SVE2P1: liveins: $z0_z1_z2
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 1
+ ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 2
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x3_no_multivec
+ ; NOPT: liveins: $z0_z1_z2
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 2
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZXI $z0_z1_z2, $sp, 0
+ RET_ReallyLR
+...
+---
+name: store_x2_max_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_max_offset
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 7
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_max_offset
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 14
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 15
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, 14
+ RET_ReallyLR
+...
+---
+name: store_x2_min_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_min_offset
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, -8
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_min_offset
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, -16
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, -15
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, -16
+ RET_ReallyLR
+...
+---
+name: store_x4_max_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2_z3
+
+ ; SVE2P1-LABEL: name: store_x4_max_offset
+ ; SVE2P1: liveins: $z0_z1_z2_z3
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, 7
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_max_offset
+ ; NOPT: liveins: $z0_z1_z2_z3
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 28
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 29
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 30
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, 31
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z0_z1_z2_z3, $sp, 28
+ RET_ReallyLR
+...
+---
+name: store_x4_min_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2_z3
+
+ ; SVE2P1-LABEL: name: store_x4_min_offset
+ ; SVE2P1: liveins: $z0_z1_z2_z3
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, -8
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_min_offset
+ ; NOPT: liveins: $z0_z1_z2_z3
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, -32
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, -31
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, -30
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, -29
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z0_z1_z2_z3, $sp, -32
+ RET_ReallyLR
+...
+---
+name: store_x2_offset_out_of_range
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_offset_out_of_range
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 16
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 17
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_offset_out_of_range
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 16
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 17
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, 16
+ RET_ReallyLR
+...
+---
+name: store_x2_offset_not_scaled
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1
+
+ ; SVE2P1-LABEL: name: store_x2_offset_not_scaled
+ ; SVE2P1: liveins: $z0_z1
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 3
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 4
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_offset_not_scaled
+ ; NOPT: liveins: $z0_z1
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 3
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 4
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, 3
+ RET_ReallyLR
+...
+---
+name: store_x4_offset_out_of_range
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2_z3
+
+ ; SVE2P1-LABEL: name: store_x4_offset_out_of_range
+ ; SVE2P1: liveins: $z0_z1_z2_z3
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 32
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 33
+ ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 34
+ ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 35
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_offset_out_of_range
+ ; NOPT: liveins: $z0_z1_z2_z3
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 32
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 33
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 34
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, 35
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z0_z1_z2_z3, $sp, 32
+ RET_ReallyLR
+...
+---
+name: store_x4_offset_not_scaled
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1_z2_z3
+
+ ; SVE2P1-LABEL: name: store_x4_offset_not_scaled
+ ; SVE2P1: liveins: $z0_z1_z2_z3
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 2
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 3
+ ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 4
+ ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 5
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x4_offset_not_scaled
+ ; NOPT: liveins: $z0_z1_z2_z3
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 2
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 3
+ ; NOPT-NEXT: STR_ZXI $z2, $sp, 4
+ ; NOPT-NEXT: STR_ZXI $z3, $sp, 5
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZZZXI $z0_z1_z2_z3, $sp, 2
+ RET_ReallyLR
+...
+---
+name: store_x2_all_pnr_live
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15
+
+ ; SVE2P1-LABEL: name: store_x2_all_pnr_live
+ ; SVE2P1: liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0
+ ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 1
+ ; SVE2P1-NEXT: RET undef $lr, implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15
+ ;
+ ; NOPT-LABEL: name: store_x2_all_pnr_live
+ ; NOPT: liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: RET undef $lr, implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15
+ STR_ZZXI $z0_z1, $sp, 0
+ RET_ReallyLR implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15
+...
+---
+name: store_x2_higher_regs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z20_z21
+
+ ; SVE2P1-LABEL: name: store_x2_higher_regs
+ ; SVE2P1: liveins: $z20_z21
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z20_z21, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: store_x2_higher_regs
+ ; NOPT: liveins: $z20_z21
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z20, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z21, $sp, 1
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z20_z21, $sp, 0
+ RET_ReallyLR
+...
+---
+name: load_x4_higher_regs
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SVE2P1-LABEL: name: load_x4_higher_regs
+ ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: $z8_z9_z10_z11 = LD1B_4Z_IMM killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: RET undef $lr, implicit $z8_z9_z10_z11
+ ;
+ ; NOPT-LABEL: name: load_x4_higher_regs
+ ; NOPT: $z8 = LDR_ZXI $sp, 0
+ ; NOPT-NEXT: $z9 = LDR_ZXI $sp, 1
+ ; NOPT-NEXT: $z10 = LDR_ZXI $sp, 2
+ ; NOPT-NEXT: $z11 = LDR_ZXI $sp, 3
+ ; NOPT-NEXT: RET undef $lr, implicit $z8_z9_z10_z11
+ $z8_z9_z10_z11 = LDR_ZZZZXI $sp, 0
+ RET_ReallyLR implicit $z8_z9_z10_z11
+...
+---
+name: multiple_spills_in_block
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $z0_z1, $z4_z5_z6_z7
+
+ ; SVE2P1-LABEL: name: multiple_spills_in_block
+ ; SVE2P1: liveins: $z0_z1, $z4_z5_z6_z7
+ ; SVE2P1-NEXT: {{ $}}
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0
+ ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg
+ ; SVE2P1-NEXT: ST1B_4Z_IMM $z4_z5_z6_z7, killed $pn8, $sp, 1
+ ; SVE2P1-NEXT: RET undef $lr
+ ;
+ ; NOPT-LABEL: name: multiple_spills_in_block
+ ; NOPT: liveins: $z0_z1, $z4_z5_z6_z7
+ ; NOPT-NEXT: {{ $}}
+ ; NOPT-NEXT: STR_ZXI $z0, $sp, 0
+ ; NOPT-NEXT: STR_ZXI $z1, $sp, 1
+ ; NOPT-NEXT: STR_ZXI $z4, $sp, 4
+ ; NOPT-NEXT: STR_ZXI $z5, $sp, 5
+ ; NOPT-NEXT: STR_ZXI $z6, $sp, 6
+ ; NOPT-NEXT: STR_ZXI $z7, $sp, 7
+ ; NOPT-NEXT: RET undef $lr
+ STR_ZZXI $z0_z1, $sp, 0
+ STR_ZZZZXI $z4_z5_z6_z7, $sp, 4
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll b/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll
new file mode 100644
index 000000000000..5b66ecb4d247
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefix=SVE2P1
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming < %s | FileCheck %s --check-prefix=SME2
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false < %s | FileCheck %s --check-prefix=NOPT
+
+; Test that SVE tuple spill/fill pseudo instructions are expanded into
+; multi-vector ld1b/st1b instructions when SVE2.1 or SME2 is available.
+
+define <vscale x 32 x i8> @spill_fill_x2(<vscale x 16 x i8> %unused, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; SVE2P1-LABEL: spill_fill_x2:
+; SVE2P1: // %bb.0:
+; SVE2P1-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; SVE2P1-NEXT: addvl sp, sp, #-17
+; SVE2P1-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; SVE2P1-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: addvl sp, sp, #-2
+; SVE2P1-NEXT: mov p8.b, p0.b
+; SVE2P1-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; SVE2P1-NEXT: ptrue pn8.b
+; SVE2P1-NEXT: st1b { z0.b, z1.b }, pn8, [sp]
+; SVE2P1-NEXT: //APP
+; SVE2P1-NEXT: nop
+; SVE2P1-NEXT: //NO_APP
+; SVE2P1-NEXT: ptrue pn8.b
+; SVE2P1-NEXT: ld1b { z0.b, z1.b }, pn8/z, [sp]
+; SVE2P1-NEXT: addvl sp, sp, #2
+; SVE2P1-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; SVE2P1-NEXT: addvl sp, sp, #17
+; SVE2P1-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; SVE2P1-NEXT: ret
+;
+; SME2-LABEL: spill_fill_x2:
+; SME2: // %bb.0:
+; SME2-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; SME2-NEXT: addvl sp, sp, #-17
+; SME2-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; SME2-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: addvl sp, sp, #-2
+; SME2-NEXT: mov p8.b, p0.b
+; SME2-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; SME2-NEXT: ptrue pn8.b
+; SME2-NEXT: st1b { z0.b, z1.b }, pn8, [sp]
+; SME2-NEXT: //APP
+; SME2-NEXT: nop
+; SME2-NEXT: //NO_APP
+; SME2-NEXT: ptrue pn8.b
+; SME2-NEXT: ld1b { z0.b, z1.b }, pn8/z, [sp]
+; SME2-NEXT: addvl sp, sp, #2
+; SME2-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; SME2-NEXT: addvl sp, sp, #17
+; SME2-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; SME2-NEXT: ret
+;
+; NOPT-LABEL: spill_fill_x2:
+; NOPT: // %bb.0:
+; NOPT-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPT-NEXT: addvl sp, sp, #-17
+; NOPT-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; NOPT-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: addvl sp, sp, #-2
+; NOPT-NEXT: mov p8.b, p0.b
+; NOPT-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; NOPT-NEXT: str z0, [sp]
+; NOPT-NEXT: str z1, [sp, #1, mul vl]
+; NOPT-NEXT: //APP
+; NOPT-NEXT: nop
+; NOPT-NEXT: //NO_APP
+; NOPT-NEXT: ldr z0, [sp]
+; NOPT-NEXT: ldr z1, [sp, #1, mul vl]
+; NOPT-NEXT: addvl sp, sp, #2
+; NOPT-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; NOPT-NEXT: addvl sp, sp, #17
+; NOPT-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPT-NEXT: ret
+ %ld = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
+ %v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 0
+ %v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 1
+ call void asm sideeffect "nop", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
+ %tuple = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %v0, i64 0)
+ %tuple2 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %tuple, <vscale x 16 x i8> %v1, i64 16)
+ ret <vscale x 32 x i8> %tuple2
+}
+
+define <vscale x 64 x i8> @spill_fill_x4(<vscale x 16 x i8> %unused, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; SVE2P1-LABEL: spill_fill_x4:
+; SVE2P1: // %bb.0:
+; SVE2P1-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; SVE2P1-NEXT: addvl sp, sp, #-17
+; SVE2P1-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; SVE2P1-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; SVE2P1-NEXT: addvl sp, sp, #-4
+; SVE2P1-NEXT: mov p8.b, p0.b
+; SVE2P1-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; SVE2P1-NEXT: ptrue pn8.b
+; SVE2P1-NEXT: st1b { z0.b - z3.b }, pn8, [sp]
+; SVE2P1-NEXT: //APP
+; SVE2P1-NEXT: nop
+; SVE2P1-NEXT: //NO_APP
+; SVE2P1-NEXT: ptrue pn8.b
+; SVE2P1-NEXT: ld1b { z0.b - z3.b }, pn8/z, [sp]
+; SVE2P1-NEXT: addvl sp, sp, #4
+; SVE2P1-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; SVE2P1-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; SVE2P1-NEXT: addvl sp, sp, #17
+; SVE2P1-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; SVE2P1-NEXT: ret
+;
+; SME2-LABEL: spill_fill_x4:
+; SME2: // %bb.0:
+; SME2-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; SME2-NEXT: addvl sp, sp, #-17
+; SME2-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; SME2-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; SME2-NEXT: addvl sp, sp, #-4
+; SME2-NEXT: mov p8.b, p0.b
+; SME2-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; SME2-NEXT: ptrue pn8.b
+; SME2-NEXT: st1b { z0.b - z3.b }, pn8, [sp]
+; SME2-NEXT: //APP
+; SME2-NEXT: nop
+; SME2-NEXT: //NO_APP
+; SME2-NEXT: ptrue pn8.b
+; SME2-NEXT: ld1b { z0.b - z3.b }, pn8/z, [sp]
+; SME2-NEXT: addvl sp, sp, #4
+; SME2-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; SME2-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; SME2-NEXT: addvl sp, sp, #17
+; SME2-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; SME2-NEXT: ret
+;
+; NOPT-LABEL: spill_fill_x4:
+; NOPT: // %bb.0:
+; NOPT-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPT-NEXT: addvl sp, sp, #-17
+; NOPT-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill
+; NOPT-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPT-NEXT: addvl sp, sp, #-4
+; NOPT-NEXT: mov p8.b, p0.b
+; NOPT-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; NOPT-NEXT: str z0, [sp]
+; NOPT-NEXT: str z1, [sp, #1, mul vl]
+; NOPT-NEXT: str z2, [sp, #2, mul vl]
+; NOPT-NEXT: str z3, [sp, #3, mul vl]
+; NOPT-NEXT: //APP
+; NOPT-NEXT: nop
+; NOPT-NEXT: //NO_APP
+; NOPT-NEXT: ldr z0, [sp]
+; NOPT-NEXT: ldr z1, [sp, #1, mul vl]
+; NOPT-NEXT: ldr z2, [sp, #2, mul vl]
+; NOPT-NEXT: ldr z3, [sp, #3, mul vl]
+; NOPT-NEXT: addvl sp, sp, #4
+; NOPT-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPT-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload
+; NOPT-NEXT: addvl sp, sp, #17
+; NOPT-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPT-NEXT: ret
+ %ld = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
+ %v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 0
+ %v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 1
+ %v2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 2
+ %v3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 3
+ call void asm sideeffect "nop", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
+ %t0 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> %v0, i64 0)
+ %t1 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t0, <vscale x 16 x i8> %v1, i64 16)
+ %t2 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t1, <vscale x 16 x i8> %v2, i64 32)
+ %t3 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t2, <vscale x 16 x i8> %v3, i64 48)
+ ret <vscale x 64 x i8> %t3
+}