| author | Amara Emerson <amara@apple.com> | 2026-03-17 13:25:40 -0700 |
|---|---|---|
| committer | Amara Emerson <amara@apple.com> | 2026-03-18 15:37:29 -0700 |
| commit | 6ae8c3dee3d367cde335c235bc1ebc3a2046929b (patch) | |
| tree | d8b5722abfb3e6a33852da4badb7c20e9b879349 | |
| parent | 9da068b7cfa269d736d5261b8c82b701a718e531 (diff) | |
[AArch64][SVE] Use multi-vector spill/fill instructions in pseudo expansion
SVE tuple spill/fill pseudos (STR_ZZXI, LDR_ZZZZXI, etc.) currently expand into
N individual STR_ZXI/LDR_ZXI instructions. On targets with SVE2.1, or with SME2
in streaming mode, we can instead generate a single ptrue + multi-vector
load/store. For the x2 case this is code-size neutral, but it can still be
beneficial on some microarchitectures, so I'm assuming it is generally
profitable for now; the expansion can be disabled with
-aarch64-enable-multivec-spill-fill=false.
rdar://168475826
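
For orientation before the diff: when the expansion fires for an x2 store, the
pseudo is replaced by a PTRUE_C_B defining a scratch predicate-as-counter
register, followed by a single ST1B_2Z_IMM. A minimal sketch of that emission,
condensed from tryExpandSVESpillFillToMultiVec below (all opcode and variable
names come from the patch; the surrounding pass state is assumed to be in
scope):

```cpp
// Condensed from the patch: one ptrue plus one multi-vector store replaces
// the two scalar STR_ZXI instructions the pseudo previously expanded into.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PTRUE_C_B), ScratchPNR);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ST1B_2Z_IMM))
    .addReg(TupleReg)                   // e.g. $z0_z1
    .addReg(ScratchPNR, RegState::Kill) // predicate dies at the store
    .addReg(BaseReg)                    // e.g. $sp
    .addImm(ScaledOffset);              // vl-scaled offset divided by 2
```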
5 files changed, 1058 insertions, 6 deletions
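
The legality check is worth spelling out: the multi-vector forms take a signed
4-bit immediate measured in tuple-sized multiples of the vector length, so the
pseudo's vl-scaled offset must divide evenly by the tuple size and the quotient
must fit in -8..+7. A standalone sketch of the rule (the function name is
hypothetical; in the patch the check is inlined in
tryExpandSVESpillFillToMultiVec):

```cpp
#include <optional>

// Returns the scaled immediate for an x2/x4 multi-vector spill/fill, or
// nullopt when the pseudo must keep its scalar expansion. Mirrors the
// patch: simm4s2 for x2 and simm4s4 for x4, i.e. a scaled range of -8..+7.
std::optional<int> getScaledMultiVecOffset(unsigned N, int BaseOffset) {
  int Scale = (N == 4) ? 4 : 2;
  if (BaseOffset % Scale != 0)
    return std::nullopt; // offset is not a multiple of the tuple size
  int Scaled = BaseOffset / Scale;
  if (Scaled < -8 || Scaled > 7)
    return std::nullopt; // outside the signed 4-bit immediate
  return Scaled;
}
```

This matches the MIR tests below: x2 offsets 14 and -16 (scaled to 7 and -8)
are accepted, while 16 and 3 are rejected.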
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 4b3353e54797..23b1c5e1fb79 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cassert>
@@ -41,6 +42,10 @@
 
 using namespace llvm;
 
+static cl::opt<bool> EnableMultiVecSpillFill(
+    "aarch64-enable-multivec-spill-fill", cl::init(true), cl::Hidden,
+    cl::desc("Enable multi-vector spill/fill expansion for SVE"));
+
 #define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
 
 namespace {
@@ -90,6 +95,11 @@ private:
   bool expandSVESpillFill(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, unsigned Opc,
                           unsigned N);
+  bool tryExpandSVESpillFillToMultiVec(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned MultiVecOpc, unsigned N,
+                                       LivePhysRegs &LiveRegs);
+  bool expandSVEMultiVecSpillFills(MachineBasicBlock &MBB);
   bool expandCALL_RVMARKER(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI);
   bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
@@ -111,6 +121,10 @@ private:
                                     MachineBasicBlock::iterator MBBI);
   MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI);
+
+  // Cached scratch PNR register for multi-vector spill/fill optimization.
+  // Reset at the start of each block.
+  Register CachedScratchPNR = AArch64::NoRegister;
 };
 
 } // end anonymous namespace
@@ -859,6 +873,74 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
   return true;
 }
 
+/// Try to use multi-vector instructions to replace the tuple load/stores,
+/// returns true if successful.
+bool AArch64ExpandPseudo::tryExpandSVESpillFillToMultiVec(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    unsigned MultiVecOpc, unsigned N, LivePhysRegs &LiveRegs) {
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  MachineInstr &MI = *MBBI;
+  Register TupleReg = MI.getOperand(0).getReg();
+  Register BaseReg = MI.getOperand(1).getReg();
+  int BaseOffset = MI.getOperand(2).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Check immediate range for multi-vector instructions.
+  //   simm4s2 for x2: -8 to +7 (scaled by 2)
+  //   simm4s4 for x4: -8 to +7 (scaled by 4)
+  int Scale = N == 4 ? 4 : 2;
+  int ScaledOffset = BaseOffset / Scale;
+  if ((BaseOffset % Scale != 0) || ScaledOffset < -8 || ScaledOffset > 7)
+    return false;
+
+  // Find a scratch PNR register that is not live at this point.
+  // First check if our cached register is still usable.
+  Register ScratchPNR = CachedScratchPNR;
+
+  if (ScratchPNR == AArch64::NoRegister || LiveRegs.contains(ScratchPNR)) {
+    ScratchPNR = AArch64::NoRegister;
+    for (MCPhysReg Reg :
+         {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11,
+          AArch64::PN12, AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+      if (!LiveRegs.contains(Reg)) {
+        ScratchPNR = Reg;
+        break;
+      }
+    }
+    if (ScratchPNR == AArch64::NoRegister)
+      return false; // No available scratch register, bail out.
+    CachedScratchPNR = ScratchPNR;
+  }
+
+  // Get the register tuple for the multi-vector instruction.
+  Register FirstReg = TRI->getSubReg(TupleReg, AArch64::zsub0);
+  unsigned RegNum = FirstReg - AArch64::Z0;
+  Register MultiVecTupleReg =
+      (N == 4) ? AArch64::Z0_Z1_Z2_Z3 + RegNum : AArch64::Z0_Z1 + RegNum;
+
+  bool IsLoad = (MultiVecOpc == AArch64::LD1B_2Z_IMM ||
+                 MultiVecOpc == AArch64::LD1B_4Z_IMM);
+  bool Kill = MI.getOperand(1).isKill();
+
+  // Insert PTRUE_C_B to set up the predicate.
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::PTRUE_C_B), ScratchPNR);
+
+  // Build the multi-vector instruction.
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(MultiVecOpc));
+  if (IsLoad)
+    MIB.addReg(MultiVecTupleReg, RegState::Define);
+  else
+    MIB.addReg(MultiVecTupleReg);
+  MIB.addReg(ScratchPNR, RegState::Kill);
+  MIB.addReg(BaseReg, getKillRegState(Kill));
+  MIB.addImm(ScaledOffset);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator MBBI,
                                              unsigned Opc, unsigned N) {
@@ -2005,11 +2087,96 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   return false;
 }
 
+/// Map SVE tuple spill/fill pseudo opcodes to multi-vector opcodes.
+/// Returns {MultiVecOpc, N} or {0, 0} if not a candidate.
+/// Only ZPR (not PPR) spills with contiguous, properly-aligned registers
+/// can use multi-vector instructions.
+static std::pair<unsigned, unsigned>
+getSVEMultiVecSpillFillInfo(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AArch64::STR_ZZZZXI:
+  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
+    if (AArch64::ZPR4Mul4RegClass.contains(MI.getOperand(0).getReg()))
+      return {AArch64::ST1B_4Z_IMM, 4};
+    return {0, 0};
+  case AArch64::STR_ZZXI:
+  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
+    if (AArch64::ZPR2Mul2RegClass.contains(MI.getOperand(0).getReg()))
+      return {AArch64::ST1B_2Z_IMM, 2};
+    return {0, 0};
+  case AArch64::LDR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
+    if (AArch64::ZPR4Mul4RegClass.contains(MI.getOperand(0).getReg()))
+      return {AArch64::LD1B_4Z_IMM, 4};
+    return {0, 0};
+  case AArch64::LDR_ZZXI:
+  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
+    if (AArch64::ZPR2Mul2RegClass.contains(MI.getOperand(0).getReg()))
+      return {AArch64::LD1B_2Z_IMM, 2};
+    return {0, 0};
+  default:
+    return {0, 0};
+  }
+}
+
+/// Try to expand SVE spill/fill pseudos into multi-vector instructions.
+/// Iterates backward through the block with incremental liveness to find
+/// free PNR scratch registers, avoiding O(n^2) liveness recomputation.
+bool AArch64ExpandPseudo::expandSVEMultiVecSpillFills(MachineBasicBlock &MBB) {
+  if (!EnableMultiVecSpillFill || MBB.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+  if (!STI.hasSVE2p1() && !(STI.hasSME2() && STI.isStreaming()))
+    return false;
+
+  bool Modified = false;
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  LivePhysRegs LiveRegs(*TRI);
+  LiveRegs.addLiveOuts(MBB);
+  auto MBBI = MBB.end();
+  while (MBBI != MBB.begin()) {
+    --MBBI;
+    MachineInstr &MI = *MBBI;
+    auto [MultiVecOpc, N] = getSVEMultiVecSpillFillInfo(MI);
+    if (MultiVecOpc != 0) {
+      // Save iterator to instruction after MI so we can find inserted
+      // instructions after MI is erased.
+      auto AfterMI = std::next(MachineBasicBlock::iterator(MBBI));
+      if (tryExpandSVESpillFillToMultiVec(MBB, MBBI, MultiVecOpc, N,
+                                          LiveRegs)) {
+        Modified = true;
+        // MI was erased. Two new instructions (PTRUE_C_B + multi-vec)
+        // were inserted before AfterMI. Step liveness backward over them.
+        auto It = AfterMI;
+        assert(It != MBB.begin());
+        --It; // multi-vec instruction
+        LiveRegs.stepBackward(*It);
+        assert(It != MBB.begin());
+        --It; // PTRUE_C_B
+        LiveRegs.stepBackward(*It);
+        MBBI = It; // Loop will --MBBI to get instruction before PTRUE_C_B
+        continue;
+      }
+    }
+    // Step liveness backward past this instruction.
+    LiveRegs.stepBackward(MI);
+  }
+  return Modified;
+}
+
 /// Iterate over the instructions in basic block MBB and expand any
 /// pseudo instructions. Return true if anything was modified.
 bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
   bool Modified = false;
 
+  // Reset cached scratch PNR at the start of each block.
+  CachedScratchPNR = AArch64::NoRegister;
+
+  Modified |= expandSVEMultiVecSpillFills(MBB);
+
   MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
   while (MBBI != E) {
     MachineBasicBlock::iterator NMBBI = std::next(MBBI);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 0e62c7f22341..fa54d66fd2dd 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
 
 define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
 ; CHECK-LABEL: ld1_x2_i8_z0_z8:
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1cec418249d4..d0a35471e884 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+sme2
-aarch64-enable-multivec-spill-fill=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { ; STRIDED-LABEL: ldnt1_x2_i8_z0_z8: diff --git a/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir b/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir new file mode 100644 index 000000000000..c45e15e9d740 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-expand-multivec-spill-fill.mir @@ -0,0 +1,534 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -run-pass=aarch64-expand-pseudo \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=SVE2P1 +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -run-pass=aarch64-expand-pseudo \ +# RUN: -force-streaming -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=SVE2P1 +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -run-pass=aarch64-expand-pseudo \ +# RUN: -aarch64-enable-multivec-spill-fill=false -verify-machineinstrs %s -o - \ +# RUN: | FileCheck %s --check-prefix=NOPT +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=aarch64-expand-pseudo \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOPT +--- +name: store_x2_contiguous +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_contiguous + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_contiguous + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, 0 + RET_ReallyLR +... +--- +name: load_x2_contiguous +tracksRegLiveness: true +body: | + bb.0: + + ; SVE2P1-LABEL: name: load_x2_contiguous + ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: $z0_z1 = LD1B_2Z_IMM killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr, implicit $z0_z1 + ; + ; NOPT-LABEL: name: load_x2_contiguous + ; NOPT: $z0 = LDR_ZXI $sp, 0 + ; NOPT-NEXT: $z1 = LDR_ZXI $sp, 1 + ; NOPT-NEXT: RET undef $lr, implicit $z0_z1 + $z0_z1 = LDR_ZZXI $sp, 0 + RET_ReallyLR implicit $z0_z1 +... +--- +name: store_x4_contiguous +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2_z3 + + ; SVE2P1-LABEL: name: store_x4_contiguous + ; SVE2P1: liveins: $z0_z1_z2_z3 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_contiguous + ; NOPT: liveins: $z0_z1_z2_z3 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 2 + ; NOPT-NEXT: STR_ZXI $z3, $sp, 3 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z0_z1_z2_z3, $sp, 0 + RET_ReallyLR +... 
+--- +name: load_x4_contiguous +tracksRegLiveness: true +body: | + bb.0: + + ; SVE2P1-LABEL: name: load_x4_contiguous + ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: $z0_z1_z2_z3 = LD1B_4Z_IMM killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr, implicit $z0_z1_z2_z3 + ; + ; NOPT-LABEL: name: load_x4_contiguous + ; NOPT: $z0 = LDR_ZXI $sp, 0 + ; NOPT-NEXT: $z1 = LDR_ZXI $sp, 1 + ; NOPT-NEXT: $z2 = LDR_ZXI $sp, 2 + ; NOPT-NEXT: $z3 = LDR_ZXI $sp, 3 + ; NOPT-NEXT: RET undef $lr, implicit $z0_z1_z2_z3 + $z0_z1_z2_z3 = LDR_ZZZZXI $sp, 0 + RET_ReallyLR implicit $z0_z1_z2_z3 +... +--- +name: store_x2_strided_contiguous +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_strided_contiguous + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_strided_contiguous + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI_STRIDED_CONTIGUOUS $z0_z1, $sp, 0 + RET_ReallyLR +... +--- +name: store_x4_strided_contiguous +tracksRegLiveness: true +body: | + bb.0: + liveins: $z4_z5_z6_z7 + + ; SVE2P1-LABEL: name: store_x4_strided_contiguous + ; SVE2P1: liveins: $z4_z5_z6_z7 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_4Z_IMM $z4_z5_z6_z7, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_strided_contiguous + ; NOPT: liveins: $z4_z5_z6_z7 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z4, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z5, $sp, 1 + ; NOPT-NEXT: STR_ZXI $z6, $sp, 2 + ; NOPT-NEXT: STR_ZXI $z7, $sp, 3 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI_STRIDED_CONTIGUOUS $z4_z5_z6_z7, $sp, 0 + RET_ReallyLR +... +--- +name: store_x2_noncontiguous +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z8 + + ; SVE2P1-LABEL: name: store_x2_noncontiguous + ; SVE2P1: liveins: $z0_z8 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0 + ; SVE2P1-NEXT: STR_ZXI $z8, $sp, 1 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_noncontiguous + ; NOPT: liveins: $z0_z8 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z8, $sp, 1 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI_STRIDED_CONTIGUOUS $z0_z8, $sp, 0 + RET_ReallyLR +... +--- +name: store_x2_odd_aligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $z1_z2 + + ; SVE2P1-LABEL: name: store_x2_odd_aligned + ; SVE2P1: liveins: $z1_z2 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 0 + ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 1 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_odd_aligned + ; NOPT: liveins: $z1_z2 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z1, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 1 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z1_z2, $sp, 0 + RET_ReallyLR +... 
+--- +name: store_x4_not_4_aligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $z2_z3_z4_z5 + + ; SVE2P1-LABEL: name: store_x4_not_4_aligned + ; SVE2P1: liveins: $z2_z3_z4_z5 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 0 + ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 1 + ; SVE2P1-NEXT: STR_ZXI $z4, $sp, 2 + ; SVE2P1-NEXT: STR_ZXI $z5, $sp, 3 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_not_4_aligned + ; NOPT: liveins: $z2_z3_z4_z5 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z2, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z3, $sp, 1 + ; NOPT-NEXT: STR_ZXI $z4, $sp, 2 + ; NOPT-NEXT: STR_ZXI $z5, $sp, 3 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z2_z3_z4_z5, $sp, 0 + RET_ReallyLR +... +--- +name: store_x3_no_multivec +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2 + + ; SVE2P1-LABEL: name: store_x3_no_multivec + ; SVE2P1: liveins: $z0_z1_z2 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 1 + ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 2 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x3_no_multivec + ; NOPT: liveins: $z0_z1_z2 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 2 + ; NOPT-NEXT: RET undef $lr + STR_ZZZXI $z0_z1_z2, $sp, 0 + RET_ReallyLR +... +--- +name: store_x2_max_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_max_offset + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 7 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_max_offset + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 14 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 15 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, 14 + RET_ReallyLR +... +--- +name: store_x2_min_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_min_offset + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, -8 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_min_offset + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, -16 + ; NOPT-NEXT: STR_ZXI $z1, $sp, -15 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, -16 + RET_ReallyLR +... +--- +name: store_x4_max_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2_z3 + + ; SVE2P1-LABEL: name: store_x4_max_offset + ; SVE2P1: liveins: $z0_z1_z2_z3 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, 7 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_max_offset + ; NOPT: liveins: $z0_z1_z2_z3 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 28 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 29 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 30 + ; NOPT-NEXT: STR_ZXI $z3, $sp, 31 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z0_z1_z2_z3, $sp, 28 + RET_ReallyLR +... 
+--- +name: store_x4_min_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2_z3 + + ; SVE2P1-LABEL: name: store_x4_min_offset + ; SVE2P1: liveins: $z0_z1_z2_z3 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_4Z_IMM $z0_z1_z2_z3, killed $pn8, $sp, -8 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_min_offset + ; NOPT: liveins: $z0_z1_z2_z3 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, -32 + ; NOPT-NEXT: STR_ZXI $z1, $sp, -31 + ; NOPT-NEXT: STR_ZXI $z2, $sp, -30 + ; NOPT-NEXT: STR_ZXI $z3, $sp, -29 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z0_z1_z2_z3, $sp, -32 + RET_ReallyLR +... +--- +name: store_x2_offset_out_of_range +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_offset_out_of_range + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 16 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 17 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_offset_out_of_range + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 16 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 17 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, 16 + RET_ReallyLR +... +--- +name: store_x2_offset_not_scaled +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1 + + ; SVE2P1-LABEL: name: store_x2_offset_not_scaled + ; SVE2P1: liveins: $z0_z1 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 3 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 4 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_offset_not_scaled + ; NOPT: liveins: $z0_z1 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 3 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 4 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, 3 + RET_ReallyLR +... +--- +name: store_x4_offset_out_of_range +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2_z3 + + ; SVE2P1-LABEL: name: store_x4_offset_out_of_range + ; SVE2P1: liveins: $z0_z1_z2_z3 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 32 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 33 + ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 34 + ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 35 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_offset_out_of_range + ; NOPT: liveins: $z0_z1_z2_z3 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 32 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 33 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 34 + ; NOPT-NEXT: STR_ZXI $z3, $sp, 35 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z0_z1_z2_z3, $sp, 32 + RET_ReallyLR +... +--- +name: store_x4_offset_not_scaled +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1_z2_z3 + + ; SVE2P1-LABEL: name: store_x4_offset_not_scaled + ; SVE2P1: liveins: $z0_z1_z2_z3 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 2 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 3 + ; SVE2P1-NEXT: STR_ZXI $z2, $sp, 4 + ; SVE2P1-NEXT: STR_ZXI $z3, $sp, 5 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x4_offset_not_scaled + ; NOPT: liveins: $z0_z1_z2_z3 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 2 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 3 + ; NOPT-NEXT: STR_ZXI $z2, $sp, 4 + ; NOPT-NEXT: STR_ZXI $z3, $sp, 5 + ; NOPT-NEXT: RET undef $lr + STR_ZZZZXI $z0_z1_z2_z3, $sp, 2 + RET_ReallyLR +... 
+--- +name: store_x2_all_pnr_live +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15 + + ; SVE2P1-LABEL: name: store_x2_all_pnr_live + ; SVE2P1: liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: STR_ZXI $z0, $sp, 0 + ; SVE2P1-NEXT: STR_ZXI $z1, $sp, 1 + ; SVE2P1-NEXT: RET undef $lr, implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15 + ; + ; NOPT-LABEL: name: store_x2_all_pnr_live + ; NOPT: liveins: $z0_z1, $pn8, $pn9, $pn10, $pn11, $pn12, $pn13, $pn14, $pn15 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: RET undef $lr, implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15 + STR_ZZXI $z0_z1, $sp, 0 + RET_ReallyLR implicit $pn8, implicit $pn9, implicit $pn10, implicit $pn11, implicit $pn12, implicit $pn13, implicit $pn14, implicit $pn15 +... +--- +name: store_x2_higher_regs +tracksRegLiveness: true +body: | + bb.0: + liveins: $z20_z21 + + ; SVE2P1-LABEL: name: store_x2_higher_regs + ; SVE2P1: liveins: $z20_z21 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z20_z21, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: store_x2_higher_regs + ; NOPT: liveins: $z20_z21 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z20, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z21, $sp, 1 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z20_z21, $sp, 0 + RET_ReallyLR +... +--- +name: load_x4_higher_regs +tracksRegLiveness: true +body: | + bb.0: + + ; SVE2P1-LABEL: name: load_x4_higher_regs + ; SVE2P1: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: $z8_z9_z10_z11 = LD1B_4Z_IMM killed $pn8, $sp, 0 + ; SVE2P1-NEXT: RET undef $lr, implicit $z8_z9_z10_z11 + ; + ; NOPT-LABEL: name: load_x4_higher_regs + ; NOPT: $z8 = LDR_ZXI $sp, 0 + ; NOPT-NEXT: $z9 = LDR_ZXI $sp, 1 + ; NOPT-NEXT: $z10 = LDR_ZXI $sp, 2 + ; NOPT-NEXT: $z11 = LDR_ZXI $sp, 3 + ; NOPT-NEXT: RET undef $lr, implicit $z8_z9_z10_z11 + $z8_z9_z10_z11 = LDR_ZZZZXI $sp, 0 + RET_ReallyLR implicit $z8_z9_z10_z11 +... +--- +name: multiple_spills_in_block +tracksRegLiveness: true +body: | + bb.0: + liveins: $z0_z1, $z4_z5_z6_z7 + + ; SVE2P1-LABEL: name: multiple_spills_in_block + ; SVE2P1: liveins: $z0_z1, $z4_z5_z6_z7 + ; SVE2P1-NEXT: {{ $}} + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_2Z_IMM $z0_z1, killed $pn8, $sp, 0 + ; SVE2P1-NEXT: $pn8 = PTRUE_C_B implicit $vg + ; SVE2P1-NEXT: ST1B_4Z_IMM $z4_z5_z6_z7, killed $pn8, $sp, 1 + ; SVE2P1-NEXT: RET undef $lr + ; + ; NOPT-LABEL: name: multiple_spills_in_block + ; NOPT: liveins: $z0_z1, $z4_z5_z6_z7 + ; NOPT-NEXT: {{ $}} + ; NOPT-NEXT: STR_ZXI $z0, $sp, 0 + ; NOPT-NEXT: STR_ZXI $z1, $sp, 1 + ; NOPT-NEXT: STR_ZXI $z4, $sp, 4 + ; NOPT-NEXT: STR_ZXI $z5, $sp, 5 + ; NOPT-NEXT: STR_ZXI $z6, $sp, 6 + ; NOPT-NEXT: STR_ZXI $z7, $sp, 7 + ; NOPT-NEXT: RET undef $lr + STR_ZZXI $z0_z1, $sp, 0 + STR_ZZZZXI $z4_z5_z6_z7, $sp, 4 + RET_ReallyLR +... 
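Besides the offset range, the MIR cases above also pin down the register-alignment requirement: only tuples in the ZPR2Mul2/ZPR4Mul4 register classes are rewritten. A condensed, hypothetical equivalent of what that class membership implies:

```cpp
// The ZPR2Mul2 / ZPR4Mul4 checks in getSVEMultiVecSpillFillInfo amount to
// requiring the tuple's first Z register index to be a multiple of the
// tuple size.
bool isMultiVecAlignedTuple(unsigned FirstZRegIdx, unsigned N) {
  // z0_z1 (idx 0, N=2) and z4_z5_z6_z7 (idx 4, N=4) qualify; z1_z2 (idx 1)
  // and z2_z3_z4_z5 (idx 2, N=4) fall back to the scalar expansion, as
  // store_x2_odd_aligned and store_x4_not_4_aligned verify above.
  return FirstZRegIdx % N == 0;
}
```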
diff --git a/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll b/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll new file mode 100644 index 000000000000..5b66ecb4d247 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-multivec-spill-fill.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefix=SVE2P1 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming < %s | FileCheck %s --check-prefix=SME2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -aarch64-enable-multivec-spill-fill=false < %s | FileCheck %s --check-prefix=NOPT + +; Test that SVE tuple spill/fill pseudo instructions are expanded into +; multi-vector ld1b/st1b instructions when SVE2.1 or SME2 is available. + +define <vscale x 32 x i8> @spill_fill_x2(<vscale x 16 x i8> %unused, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; SVE2P1-LABEL: spill_fill_x2: +; SVE2P1: // %bb.0: +; SVE2P1-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; SVE2P1-NEXT: addvl sp, sp, #-17 +; SVE2P1-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; SVE2P1-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: addvl sp, sp, #-2 +; SVE2P1-NEXT: mov p8.b, p0.b +; SVE2P1-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; SVE2P1-NEXT: ptrue pn8.b +; SVE2P1-NEXT: st1b { z0.b, z1.b }, pn8, [sp] +; SVE2P1-NEXT: //APP +; SVE2P1-NEXT: nop +; SVE2P1-NEXT: //NO_APP +; SVE2P1-NEXT: ptrue pn8.b +; SVE2P1-NEXT: ld1b { z0.b, z1.b }, pn8/z, [sp] +; SVE2P1-NEXT: addvl sp, sp, #2 +; SVE2P1-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload 
+; SVE2P1-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; SVE2P1-NEXT: addvl sp, sp, #17 +; SVE2P1-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; SVE2P1-NEXT: ret +; +; SME2-LABEL: spill_fill_x2: +; SME2: // %bb.0: +; SME2-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; SME2-NEXT: addvl sp, sp, #-17 +; SME2-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; SME2-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; SME2-NEXT: addvl sp, sp, #-2 +; SME2-NEXT: mov p8.b, p0.b +; SME2-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; SME2-NEXT: ptrue pn8.b +; SME2-NEXT: st1b { z0.b, z1.b }, pn8, [sp] +; SME2-NEXT: //APP +; SME2-NEXT: nop +; SME2-NEXT: //NO_APP +; SME2-NEXT: ptrue pn8.b +; SME2-NEXT: ld1b { z0.b, z1.b }, pn8/z, [sp] +; SME2-NEXT: addvl sp, sp, #2 +; SME2-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; SME2-NEXT: addvl sp, sp, #17 +; SME2-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret +; +; NOPT-LABEL: spill_fill_x2: +; NOPT: // %bb.0: +; NOPT-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; NOPT-NEXT: addvl sp, sp, #-17 +; NOPT-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; NOPT-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: addvl sp, sp, #-2 +; NOPT-NEXT: mov p8.b, p0.b +; NOPT-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; NOPT-NEXT: str z0, [sp] +; NOPT-NEXT: str z1, [sp, #1, mul vl] +; NOPT-NEXT: //APP +; NOPT-NEXT: nop +; NOPT-NEXT: //NO_APP +; NOPT-NEXT: ldr z0, [sp] +; NOPT-NEXT: ldr z1, [sp, #1, mul vl] +; NOPT-NEXT: addvl sp, sp, #2 +; NOPT-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; NOPT-NEXT: addvl sp, sp, #17 +; NOPT-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPT-NEXT: ret + %ld = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + %v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 0 + %v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 1 + call void asm sideeffect "nop", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %tuple = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %v0, i64 0) + %tuple2 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %tuple, <vscale x 16 x i8> %v1, i64 16) + ret <vscale x 32 x i8> %tuple2 +} + +define <vscale x 64 x i8> 
@spill_fill_x4(<vscale x 16 x i8> %unused, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; SVE2P1-LABEL: spill_fill_x4: +; SVE2P1: // %bb.0: +; SVE2P1-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; SVE2P1-NEXT: addvl sp, sp, #-17 +; SVE2P1-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; SVE2P1-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; SVE2P1-NEXT: addvl sp, sp, #-4 +; SVE2P1-NEXT: mov p8.b, p0.b +; SVE2P1-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; SVE2P1-NEXT: ptrue pn8.b +; SVE2P1-NEXT: st1b { z0.b - z3.b }, pn8, [sp] +; SVE2P1-NEXT: //APP +; SVE2P1-NEXT: nop +; SVE2P1-NEXT: //NO_APP +; SVE2P1-NEXT: ptrue pn8.b +; SVE2P1-NEXT: ld1b { z0.b - z3.b }, pn8/z, [sp] +; SVE2P1-NEXT: addvl sp, sp, #4 +; SVE2P1-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; SVE2P1-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; SVE2P1-NEXT: addvl sp, sp, #17 +; SVE2P1-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; SVE2P1-NEXT: ret +; +; SME2-LABEL: spill_fill_x4: +; SME2: // %bb.0: +; SME2-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; SME2-NEXT: addvl sp, sp, #-17 +; SME2-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; SME2-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; SME2-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; SME2-NEXT: addvl sp, sp, #-4 +; SME2-NEXT: mov p8.b, p0.b +; SME2-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; SME2-NEXT: ptrue pn8.b +; SME2-NEXT: st1b { z0.b - z3.b }, pn8, [sp] +; SME2-NEXT: //APP +; SME2-NEXT: nop +; SME2-NEXT: //NO_APP +; SME2-NEXT: ptrue pn8.b +; SME2-NEXT: ld1b { z0.b - z3.b }, pn8/z, [sp] +; SME2-NEXT: addvl sp, sp, #4 +; SME2-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; SME2-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; SME2-NEXT: addvl sp, sp, #17 +; SME2-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret +; +; NOPT-LABEL: spill_fill_x4: +; NOPT: // %bb.0: +; NOPT-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; NOPT-NEXT: addvl sp, sp, #-17 +; NOPT-NEXT: str p8, [sp, #7, mul vl] // 2-byte Spill +; NOPT-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPT-NEXT: addvl sp, sp, #-4 +; NOPT-NEXT: mov p8.b, p0.b +; NOPT-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; NOPT-NEXT: str z0, [sp] +; NOPT-NEXT: str z1, [sp, #1, mul vl] +; NOPT-NEXT: str z2, [sp, #2, mul vl] +; NOPT-NEXT: str z3, [sp, #3, mul vl] +; NOPT-NEXT: //APP +; NOPT-NEXT: nop +; NOPT-NEXT: //NO_APP +; NOPT-NEXT: ldr z0, [sp] +; NOPT-NEXT: ldr z1, [sp, #1, mul vl] +; NOPT-NEXT: ldr z2, [sp, #2, mul vl] +; NOPT-NEXT: ldr z3, [sp, #3, mul vl] +; NOPT-NEXT: addvl sp, sp, #4 +; NOPT-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPT-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Reload +; NOPT-NEXT: addvl sp, sp, #17 +; NOPT-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPT-NEXT: ret + %ld = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + %v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 0 + %v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 1 + %v2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 2 + %v3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ld, 3 + call void asm sideeffect "nop", 
"~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %t0 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> %v0, i64 0) + %t1 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t0, <vscale x 16 x i8> %v1, i64 16) + %t2 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t1, <vscale x 16 x i8> %v2, i64 32) + %t3 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %t2, <vscale x 16 x i8> %v3, i64 48) + ret <vscale x 64 x i8> %t3 +} |
