aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp246
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h11
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td15
-rw-r--r--llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll22
-rw-r--r--llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll79
-rw-r--r--llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll54
-rw-r--r--llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll518
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll43
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-body.ll152
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll131
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-interface.ll49
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll1177
-rw-r--r--llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll76
18 files changed, 2167 insertions, 452 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cf617c7..a991813 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext())
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -558,6 +558,10 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ SMEAttrs Attrs(MF.getFunction());
+ bool LocallyStreaming =
+ Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
@@ -569,14 +573,22 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
DebugLoc DL = MBB.findDebugLoc(MBBI);
for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ unsigned FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
+
+ // The location of VG will be emitted before each streaming-mode change in
+ // the function. Only locally-streaming functions require emitting the
+ // non-streaming VG location here.
+ if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
+ (!LocallyStreaming &&
+ DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true)))
+ continue;
- int64_t Offset =
- MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -699,6 +711,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
+ if (!Info.isRestored())
+ continue;
+
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1342,6 +1357,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
+bool requiresGetVGCall(MachineFunction &MF) {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasStreamingModeChanges() &&
+ !MF.getSubtarget<AArch64Subtarget>().hasSVE();
+}
+
+bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
+ unsigned Opc = MBBI->getOpcode();
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
+ Opc == AArch64::UBFMXri)
+ return true;
+
+ if (requiresGetVGCall(*MBBI->getMF())) {
+ if (Opc == AArch64::ORRXrr)
+ return true;
+
+ if (Opc == AArch64::BL) {
+ auto Op1 = MBBI->getOperand(0);
+ return Op1.isSymbol() &&
+ (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
+ }
+ }
+
+ return false;
+}
+
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
@@ -1352,6 +1393,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
int CFAOffset = 0) {
unsigned NewOpc;
+
+ // If the function contains streaming mode changes, we expect instructions
+ // to calculate the value of VG before spilling. For locally-streaming
+ // functions, we need to do this for both the streaming and non-streaming
+ // vector length. Move past these instructions if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1408,7 +1460,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// If the first store isn't right where we want SP then we can't fold the
// update in so create a normal arithmetic instruction instead.
- MachineFunction &MF = *MBB.getParent();
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
@@ -1660,6 +1711,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::X19);
LiveRegs.removeReg(AArch64::FP);
LiveRegs.removeReg(AArch64::LR);
+
+ // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+ // This is necessary to spill VG if required where SVE is unavailable, but
+ // X0 is preserved around this call.
+ if (requiresGetVGCall(MF))
+ LiveRegs.removeReg(AArch64::X0);
}
auto VerifyClobberOnExit = make_scope_exit([&]() {
@@ -1846,6 +1903,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer bump above.
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
+ // Move past instructions generated to calculate VG
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -2768,7 +2830,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
RegPairInfo() = default;
@@ -2780,6 +2842,7 @@ struct RegPairInfo {
return 2;
case GPR:
case FPR64:
+ case VG:
return 8;
case ZPR:
case FPR128:
@@ -2855,6 +2918,8 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
+ else if (RPI.Reg1 == AArch64::VG)
+ RPI.Type = RegPairInfo::VG;
else
llvm_unreachable("Unsupported register class.");
@@ -2887,6 +2952,8 @@ static void computeCalleeSaveRegisterPairs(
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::VG:
+ break;
}
}
@@ -3003,6 +3070,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
@@ -3070,7 +3138,70 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ StrOpc = AArch64::STRXui;
+ Size = 8;
+ Alignment = Align(8);
+ break;
}
+
+ unsigned X0Scratch = AArch64::NoRegister;
+ if (Reg1 == AArch64::VG) {
+ // Find an available register to store value of VG to.
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+ assert(Reg1 != AArch64::NoRegister);
+ SMEAttrs Attrs(MF.getFunction());
+
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
+ AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
+ // For locally-streaming functions, we need to store both the streaming
+ // & non-streaming VG. Spill the streaming value first.
+ BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
+ .addReg(Reg1)
+ .addImm(3)
+ .addImm(63)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ AFI->setStreamingVGIdx(RPI.FrameIdx);
+ } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ AFI->setVGIdx(RPI.FrameIdx);
+ } else {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ if (llvm::any_of(
+ MBB.liveins(),
+ [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+ return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+ AArch64::X0, LiveIn.PhysReg);
+ }))
+ X0Scratch = Reg1;
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X0, RegState::Undef)
+ .addReg(AArch64::X0, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ const uint32_t *RegMask = TRI->getCallPreservedMask(
+ MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
+ .addExternalSymbol("__arm_get_current_vg")
+ .addRegMask(RegMask)
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .setMIFlag(MachineInstr::FrameSetup);
+ Reg1 = AArch64::X0;
+ AFI->setVGIdx(RPI.FrameIdx);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3162,6 +3293,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
+ .addReg(AArch64::XZR)
+ .addReg(X0Scratch, RegState::Undef)
+ .addReg(X0Scratch, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
}
return true;
}
@@ -3241,6 +3379,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ continue;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3440,6 +3580,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += RegSize;
}
+ // Increase the callee-saved stack size if the function has streaming mode
+ // changes, as we will need to spill the value of the VG register.
+ // For locally streaming functions, we spill both the streaming and
+ // non-streaming VG value.
+ const Function &F = MF.getFunction();
+ SMEAttrs Attrs(F);
+ if (AFI->hasStreamingModeChanges()) {
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ CSStackSize += 16;
+ else
+ CSStackSize += 8;
+ }
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3576,6 +3729,33 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
}
+ // Insert VG into the list of CSRs, immediately before LR if saved.
+ if (AFI->hasStreamingModeChanges()) {
+ std::vector<CalleeSavedInfo> VGSaves;
+ SMEAttrs Attrs(MF.getFunction());
+
+ auto VGInfo = CalleeSavedInfo(AArch64::VG);
+ VGInfo.setRestored(false);
+ VGSaves.push_back(VGInfo);
+
+ // Add VG again if the function is locally-streaming, as we will spill two
+ // values.
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ VGSaves.push_back(VGInfo);
+
+ bool InsertBeforeLR = false;
+
+ for (unsigned I = 0; I < CSI.size(); I++)
+ if (CSI[I].getReg() == AArch64::LR) {
+ InsertBeforeLR = true;
+ CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
+ break;
+ }
+
+ if (!InsertBeforeLR)
+ CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
@@ -4191,12 +4371,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
}
} // namespace
+MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+
+ if (MI.getOpcode() != AArch64::VGSavePseudo &&
+ MI.getOpcode() != AArch64::VGRestorePseudo)
+ return II;
+
+ SMEAttrs FuncAttrs(MF->getFunction());
+ bool LocallyStreaming =
+ FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
+ const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ int64_t VGFrameIdx =
+ LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
+ assert(VGFrameIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for VG");
+
+ unsigned CFIIndex;
+ if (MI.getOpcode() == AArch64::VGSavePseudo) {
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset =
+ MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset));
+ } else
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true)));
+
+ MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(),
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ MI.eraseFromParent();
+ return UnwindInst->getIterator();
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
- if (StackTaggingMergeSetTag)
- for (auto &BB : MF)
- for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
+ if (AFI->hasStreamingModeChanges())
+ II = emitVGSaveRestore(II, this);
+ if (StackTaggingMergeSetTag)
II = tryMergeAdjacentSTG(II, this, RS);
+ }
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4f819f..af8b9d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2493,6 +2493,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::FIRST_NUMBER:
break;
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
+ MAKE_CASE(AArch64ISD::VG_SAVE)
+ MAKE_CASE(AArch64ISD::VG_RESTORE)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -8514,6 +8516,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;
if (RequiresSMChange) {
+
+ Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Chain);
+ InGlue = Chain.getValue(1);
+
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
@@ -8691,6 +8698,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
+ InGlue = Result.getValue(1);
+
+ Result =
+ DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 48a4ea9..b57ba09 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -70,6 +70,9 @@ enum NodeType : unsigned {
COALESCER_BARRIER,
+ VG_SAVE,
+ VG_RESTORE,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index c3d64f5..957d7bc 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
const MachineFunction &MF) const {
if (!NeedsAsyncDwarfUnwindInfo) {
const Function &F = MF.getFunction();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// The check got "minsize" is because epilogue unwind info is not emitted
// (yet) for homogeneous epilogues, outlined functions, and functions
// outlined from.
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) &&
- F.getUWTableKind() == UWTableKind::Async &&
- !F.hasMinSize();
+ NeedsAsyncDwarfUnwindInfo =
+ needsDwarfUnwindInfo(MF) &&
+ ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) ||
+ AFI->hasStreamingModeChanges());
}
return *NeedsAsyncDwarfUnwindInfo;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index df09fc5..839a3a3 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -216,6 +217,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
+ // The stack slots where VG values are stored to.
+ int64_t VGIdx = std::numeric_limits<int>::max();
+ int64_t StreamingVGIdx = std::numeric_limits<int>::max();
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -234,6 +239,12 @@ public:
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ int64_t getVGIdx() const { return VGIdx; };
+ void setVGIdx(unsigned Idx) { VGIdx = Idx; };
+
+ int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
+ void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; };
+
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2b70c47..fea70b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,6 +31,12 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
@@ -221,6 +227,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
(MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
+// the streaming value of VG around streaming-mode changes in locally-streaming
+// functions.
+def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGSave), (VGSavePseudo)>;
+
+def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGRestore), (VGRestorePseudo)>;
+
//===----------------------------------------------------------------------===//
// SME2 Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
index 44d47a0..aae1a66 100644
--- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
+++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
@@ -7,10 +7,11 @@ define void @streaming_mode_change1() #0 {
; CHECK-LABEL: streaming_mode_change1:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -23,6 +24,7 @@ define void @streaming_mode_change1() #0 {
;
; OUTLINER-LABEL: streaming_mode_change1:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
@@ -31,10 +33,11 @@ define void @streaming_mode_change2() #0 {
; CHECK-LABEL: streaming_mode_change2:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -47,6 +50,7 @@ define void @streaming_mode_change2() #0 {
;
; OUTLINER-LABEL: streaming_mode_change2:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
@@ -55,10 +59,11 @@ define void @streaming_mode_change3() #0 {
; CHECK-LABEL: streaming_mode_change3:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -71,6 +76,7 @@ define void @streaming_mode_change3() #0 {
;
; OUTLINER-LABEL: streaming_mode_change3:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 0737719..c4440e7 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -10,11 +10,13 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
@@ -44,12 +47,14 @@ declare void @non_streaming()
define void @streaming_compatible_arg(float %f) #0 {
; CHECK-LABEL: streaming_compatible_arg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
@@ -63,12 +68,13 @@ define void @streaming_compatible_arg(float %f) #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
call void @non_streaming(float %f)
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 254e37e..d786ffd 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
@@ -17,6 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT: cntd x9
+; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstart sm
; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -43,6 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT: cntd x9
+; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstart sm
; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -76,6 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -102,12 +108,17 @@ entry:
define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: sub sp, sp, #128
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -129,7 +140,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #112
+; CHECK-COMMON-NEXT: add sp, sp, #128
; CHECK-COMMON-NEXT: ret
%call = call double @normal_callee(double %x);
%add = fadd double %call, 4.200000e+01
@@ -166,11 +177,16 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -178,7 +194,7 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
call void %p() "aarch64_pstate_sm_enabled"
ret void
@@ -192,6 +208,8 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
; CHECK-COMMON: // %bb.0: // %prelude
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -240,7 +259,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: smstop za
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @za_shared_callee(double %x)
@@ -251,7 +271,8 @@ entry:
define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -279,7 +300,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: fmov d1, x8
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @normal_callee(double %x)
@@ -291,7 +313,8 @@ entry:
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: f128_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -314,7 +337,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB8_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = fadd fp128 %a, %b
ret fp128 %res
@@ -326,21 +350,22 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: f128_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #112
@@ -353,7 +378,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: frem_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -376,7 +402,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB10_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = frem double %a, %b
ret double %res
@@ -387,21 +414,22 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: frem_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl fmodf
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #96
@@ -414,12 +442,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-COMMON-LABEL: frem_call_sm_compat:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
; CHECK-COMMON-NEXT: and x19, x0, #0x1
@@ -434,13 +464,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: // %bb.3:
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: .LBB12_4:
-; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #96
+; CHECK-COMMON-NEXT: add sp, sp, #112
; CHECK-COMMON-NEXT: ret
%res = frem float %a, %b
ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0..b0d6e04 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme < %s | FileCheck %s
declare void @private_za_callee()
declare float @llvm.cos.f32(float)
@@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float)
define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_1_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
@@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_2_callees:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
+; CHECK-NEXT: rdsvl x20, #1
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: msub x8, x20, x20, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: sub x21, x29, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
call void @private_za_callee()
@@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call float @llvm.cos.f32(float %a)
ret float %res
@@ -121,13 +127,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -140,13 +148,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sturh w8, [x29, #-72]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB3_2
+; CHECK-NEXT: and x20, x0, #0x1
+; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: tbz w19, #0, .LBB3_4
+; CHECK-NEXT: tbz w20, #0, .LBB3_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB3_4:
@@ -159,12 +167,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 1d1bae4..500c511 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -16,11 +16,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -28,12 +29,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,11 +50,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -61,12 +63,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -82,11 +84,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -94,12 +97,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -115,11 +118,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov x19, x1
@@ -127,12 +131,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -148,11 +152,12 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -165,14 +170,14 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -188,11 +193,12 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@@ -205,14 +211,14 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: bl use_f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -228,11 +234,12 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -245,14 +252,14 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -273,11 +280,12 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -290,14 +298,14 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -314,11 +322,12 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -331,14 +340,14 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -355,11 +364,12 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -372,14 +382,14 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -396,11 +406,12 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -413,14 +424,14 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -437,11 +448,12 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -454,14 +466,14 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -478,11 +490,12 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -495,14 +508,14 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -519,11 +532,12 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -536,14 +550,14 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -564,11 +578,12 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -581,14 +596,14 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -604,11 +619,12 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -621,14 +637,14 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -644,11 +660,12 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -661,14 +678,14 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -684,11 +701,12 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -701,14 +719,14 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -724,11 +742,12 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -741,14 +760,14 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -764,11 +783,12 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -781,14 +801,14 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8bf16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -804,11 +824,12 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -821,14 +842,14 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -844,11 +865,12 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -861,14 +883,14 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -887,11 +909,12 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -900,10 +923,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i1
@@ -913,8 +936,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: str p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -933,23 +956,26 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
define void @dont_coalesce_res_i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i8 @get_i8()
%vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0
@@ -960,23 +986,26 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 {
define void @dont_coalesce_res_i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i16 @get_i16()
%vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0
@@ -987,23 +1016,26 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 {
define void @dont_coalesce_res_i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i32 @get_i32()
%vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0
@@ -1014,23 +1046,26 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 {
define void @dont_coalesce_res_i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i64 @get_i64()
%vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0
@@ -1041,27 +1076,30 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 {
define void @dont_coalesce_res_f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call half @get_f16()
%vec = insertelement <vscale x 8 x half> poison, half %res, i32 0
@@ -1072,12 +1110,14 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 {
define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f32
@@ -1087,11 +1127,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call float @get_f32()
%vec = insertelement <vscale x 4 x float> poison, float %res, i32 0
@@ -1102,12 +1143,14 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f64
@@ -1117,11 +1160,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call double @get_f64()
%vec = insertelement <vscale x 2 x double> poison, double %res, i32 0
@@ -1136,12 +1180,14 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i8
@@ -1151,11 +1197,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i8> @get_v1i8()
%elt = extractelement <1 x i8> %res, i32 0
@@ -1167,12 +1214,14 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i16
@@ -1182,11 +1231,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i16> @get_v1i16()
%elt = extractelement <1 x i16> %res, i32 0
@@ -1198,12 +1248,14 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i32
@@ -1213,11 +1265,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i32> @get_v1i32()
%elt = extractelement <1 x i32> %res, i32 0
@@ -1229,12 +1282,14 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i64
@@ -1244,11 +1299,12 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i64> @get_v1i64()
%elt = extractelement <1 x i64> %res, i32 0
@@ -1260,27 +1316,30 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x half> @get_v1f16()
%elt = extractelement <1 x half> %res, i32 0
@@ -1292,12 +1351,14 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f32
@@ -1307,11 +1368,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x float> @get_v1f32()
%elt = extractelement <1 x float> %res, i32 0
@@ -1323,12 +1385,14 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f64
@@ -1338,11 +1402,12 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x double> @get_v1f64()
%elt = extractelement <1 x double> %res, i32 0
@@ -1358,27 +1423,30 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v16i8
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <16 x i8> @get_v16i8()
%vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0)
@@ -1389,27 +1457,30 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8i16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x i16> @get_v8i16()
%vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0)
@@ -1420,27 +1491,30 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4i32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x i32> @get_v4i32()
%vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0)
@@ -1451,27 +1525,30 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2i64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @get_v2i64()
%vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0)
@@ -1482,27 +1559,30 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8f16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x half> @get_v8f16()
%vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0)
@@ -1513,27 +1593,30 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4f32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x float> @get_v4f32()
%vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0)
@@ -1544,27 +1627,30 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2f64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x double> @get_v2f64()
%vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index d675733..6c8aff5 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_sm_compatible_simple:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x8, x0, #0x1
; CHECK-NEXT: tbnz w8, #0, .LBB0_2
@@ -28,7 +32,7 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
ret float zeroinitializer
}
@@ -36,11 +40,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB1_2
@@ -54,11 +62,12 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee()
ret void
@@ -68,12 +77,16 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -87,11 +100,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.4: // %if.else
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_5: // %if.else
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_6: // %if.then
; CHECK-NEXT: smstop sm
@@ -101,11 +115,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_8: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index cd6d45f5..3afd571 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: bl streaming_compatible_callee
@@ -21,7 +25,7 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_compatible_callee();
@@ -47,26 +51,33 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_
define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_multiple_exit:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: cmp x0, #1
; CHECK-NEXT: b.ne .LBB2_2
; CHECK-NEXT: // %bb.1: // %if.else
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %if.end
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
entry:
@@ -87,11 +98,16 @@ if.end:
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_no_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: index z0.d, #0, #1
@@ -102,12 +118,12 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
%add = add <2 x i64> %a, <i64 41, i64 42>;
@@ -120,11 +136,15 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl locally_streaming_caller_streaming_callee
@@ -134,7 +154,7 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @locally_streaming_caller_streaming_callee();
@@ -151,12 +171,16 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -169,7 +193,7 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
ret <2 x i64> %res;
@@ -180,12 +204,16 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_
define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -198,7 +226,7 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
%v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
%res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
@@ -212,11 +240,16 @@ declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<
define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: locally_streaming_caller_alloca:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-NEXT: addsvl sp, sp, #-1
; CHECK-NEXT: smstart sm
; CHECK-NEXT: mov x0, sp
@@ -227,7 +260,7 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body"
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%alloca = alloca <vscale x 4 x i32>
call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
@@ -239,12 +272,16 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
@@ -259,7 +296,7 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
entry:
%0 = call fast double @llvm.cos.f64(double %x)
@@ -272,11 +309,16 @@ declare double @llvm.cos.f64(double)
define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: test_arg_survives_loop:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_1: // %for.body
@@ -289,12 +331,12 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
; CHECK-NEXT: fadd s0, s1, s0
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
entry:
br label %for.body
@@ -314,11 +356,15 @@ for.cond.cleanup:
define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: smstop sm
@@ -326,7 +372,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @streaming_compatible_callee();
ret void;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1e16f14..58992eb 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -mattr=+sme < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> SC (Normal -> Streaming-compatible)
@@ -36,11 +36,13 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB1_2
@@ -52,11 +54,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee();
@@ -72,11 +75,13 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -88,11 +93,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee();
@@ -124,11 +130,12 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-LABEL: streaming_compatible_with_neon_vectors:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: add x8, sp, #16
@@ -136,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: add x8, sp, #16
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -160,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -176,8 +183,9 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -255,8 +263,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
%fadd = fadd <vscale x 2 x double> %res, %arg
@@ -268,8 +276,9 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -347,8 +356,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
@@ -360,11 +369,13 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_unreachable_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
@@ -372,6 +383,10 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_4:
call void @streaming_callee()
unreachable
}
@@ -381,11 +396,13 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK: // %bb.0:
; CHECK-NEXT: tbz w0, #0, .LBB8_6
; CHECK-NEXT: // %bb.1: // %if.then
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB8_3
@@ -397,11 +414,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -417,11 +435,13 @@ exit:
define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB9_2
@@ -433,11 +453,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @normal_callee();
@@ -447,29 +468,32 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 112
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: .cfi_offset b8, -24
-; CHECK-NEXT: .cfi_offset b9, -32
-; CHECK-NEXT: .cfi_offset b10, -40
-; CHECK-NEXT: .cfi_offset b11, -48
-; CHECK-NEXT: .cfi_offset b12, -56
-; CHECK-NEXT: .cfi_offset b13, -64
-; CHECK-NEXT: .cfi_offset b14, -72
-; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x1
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbz w19, #0, .LBB10_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: smstop sm
@@ -483,12 +507,25 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB10_4: // %entry
-; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 465fb466..4321493 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> S (Normal -> Streaming)
@@ -22,10 +22,11 @@ define void @normal_caller_streaming_callee() nounwind {
; CHECK-LABEL: normal_caller_streaming_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -47,10 +48,11 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
; CHECK-LABEL: streaming_caller_normal_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl normal_callee
; CHECK-NEXT: smstart sm
@@ -103,10 +105,11 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstop sm
@@ -125,19 +128,20 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_simdfp:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
@@ -150,7 +154,9 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -216,7 +222,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
ret <vscale x 4 x i32> %x;
@@ -227,7 +233,9 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve_duplicate:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -296,7 +304,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
call void @streaming_callee()
@@ -308,11 +316,12 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -320,11 +329,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
@@ -342,10 +351,11 @@ define void @disable_tailcallopt() nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -362,11 +372,13 @@ define void @disable_tailcallopt() nounwind {
define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: rdsvl x3, #1
; CHECK-NEXT: addvl x0, sp, #2
@@ -383,7 +395,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%Data1 = alloca <vscale x 16 x i8>, align 16
@@ -400,11 +412,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 45ca784..ac19bd5 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -15,11 +15,12 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-LABEL: test_no_stackslot_scavenging:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
@@ -31,8 +32,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -46,4 +47,4 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
declare void @use_f(float)
-attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" }
+attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
new file mode 100644
index 0000000..6264ce0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -0,0 +1,1177 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -enable-machine-outliner < %s | FileCheck %s --check-prefix=OUTLINER-CHECK
+
+declare void @callee();
+declare void @fixed_callee(<4 x i32>);
+declare void @scalable_callee(<vscale x 2 x i64>);
+
+declare void @streaming_callee() #0;
+declare void @streaming_callee_with_arg(i32) #0;
+
+; Simple example of a function with one call requiring a streaming mode change
+;
+define void @vg_unwind_simple() #0 {
+; CHECK-LABEL: vg_unwind_simple:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_simple:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_simple:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee();
+ ret void;
+}
+
+; As above, but with an extra register clobbered by an inline asm call, which
+; changes NeedsGapToAlignStack to false
+;
+define void @vg_unwind_needs_gap() #0 {
+; CHECK-LABEL: vg_unwind_needs_gap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w20
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_needs_gap:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x20, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w20, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w20
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void asm sideeffect "", "~{x20}"()
+ call void @callee();
+ ret void;
+}
+
+define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_fixed_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl fixed_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_fixed_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: sub sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 112
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #80
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl fixed_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 112
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: add sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @fixed_callee(<4 x i32> %x);
+ ret void;
+}
+
+define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_sve_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl scalable_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_sve_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 48
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; FP-CHECK-NEXT: mov x29, sp
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 48
+; FP-CHECK-NEXT: .cfi_offset w27, -8
+; FP-CHECK-NEXT: .cfi_offset w28, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -40
+; FP-CHECK-NEXT: .cfi_offset w29, -48
+; FP-CHECK-NEXT: addvl sp, sp, #-18
+; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -32
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl scalable_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #18
+; FP-CHECK-NEXT: .cfi_restore z8
+; FP-CHECK-NEXT: .cfi_restore z9
+; FP-CHECK-NEXT: .cfi_restore z10
+; FP-CHECK-NEXT: .cfi_restore z11
+; FP-CHECK-NEXT: .cfi_restore z12
+; FP-CHECK-NEXT: .cfi_restore z13
+; FP-CHECK-NEXT: .cfi_restore z14
+; FP-CHECK-NEXT: .cfi_restore z15
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 48
+; FP-CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w27
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void asm sideeffect "", "~{x28}"()
+ call void @scalable_callee(<vscale x 2 x i64> %x);
+ ret void;
+}
+
+; This test was based on stack-probing-64k.ll and tries to test multiple uses of
+; findScratchNonCalleeSaveRegister.
+;
+define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
+; CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327776
+; CHECK-NEXT: .LBB4_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b.ne .LBB4_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; FP-CHECK: // %bb.0: // %entry
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w28, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .LBB4_1: // %entry
+; FP-CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; FP-CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; FP-CHECK-NEXT: cmp sp, x9
+; FP-CHECK-NEXT: str xzr, [sp]
+; FP-CHECK-NEXT: b.ne .LBB4_1
+; FP-CHECK-NEXT: // %bb.2: // %entry
+; FP-CHECK-NEXT: mov x8, sp
+; FP-CHECK-NEXT: str x8, [x0]
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+entry:
+ %v = alloca i8, i64 327680, align 1
+ store ptr %v, ptr %out, align 8
+ call void @callee()
+ ret void
+}
+
+; Locally streaming functions require storing both the streaming and
+; non-streaming values of VG.
+;
+define void @vg_locally_streaming_fn() #3 {
+; CHECK-LABEL: vg_locally_streaming_fn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_locally_streaming_fn:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: rdsvl x9, #1
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: lsr x9, x9, #3
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset vg, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee()
+ call void @streaming_callee()
+ call void @callee()
+ ret void
+}
+
+define void @streaming_compatible_to_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB6_2:
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB6_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @streaming_callee()
+ ret void
+}
+
+define void @streaming_compatible_to_non_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_non_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbz w19, #0, .LBB7_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB7_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB7_2:
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB7_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee()
+ ret void
+}
+
+; If the target does not have SVE, do not emit cntd in the prologue and
+; instead spill the result returned by __arm_get_current_vg.
+; This requires preserving the argument %x as the vg value is returned
+; in X0.
+;
+define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
+; NO-SVE-CHECK-LABEL: streaming_compatible_no_sve:
+; NO-SVE-CHECK: // %bb.0:
+; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96
+; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x9, x0
+; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg
+; NO-SVE-CHECK-NEXT: stp x0, x19, [sp, #80] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x0, x9
+; NO-SVE-CHECK-NEXT: add x29, sp, #64
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32
+; NO-SVE-CHECK-NEXT: .cfi_offset w19, -8
+; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24
+; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32
+; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40
+; NO-SVE-CHECK-NEXT: .cfi_offset b9, -48
+; NO-SVE-CHECK-NEXT: .cfi_offset b10, -56
+; NO-SVE-CHECK-NEXT: .cfi_offset b11, -64
+; NO-SVE-CHECK-NEXT: .cfi_offset b12, -72
+; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80
+; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88
+; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
+; NO-SVE-CHECK-NEXT: mov w8, w0
+; NO-SVE-CHECK-NEXT: bl __arm_sme_state
+; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2
+; NO-SVE-CHECK-NEXT: // %bb.1:
+; NO-SVE-CHECK-NEXT: smstart sm
+; NO-SVE-CHECK-NEXT: .LBB8_2:
+; NO-SVE-CHECK-NEXT: mov w0, w8
+; NO-SVE-CHECK-NEXT: bl streaming_callee_with_arg
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4
+; NO-SVE-CHECK-NEXT: // %bb.3:
+; NO-SVE-CHECK-NEXT: smstop sm
+; NO-SVE-CHECK-NEXT: .LBB8_4:
+; NO-SVE-CHECK-NEXT: .cfi_restore vg
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 0
+; NO-SVE-CHECK-NEXT: .cfi_restore w19
+; NO-SVE-CHECK-NEXT: .cfi_restore w30
+; NO-SVE-CHECK-NEXT: .cfi_restore w29
+; NO-SVE-CHECK-NEXT: .cfi_restore b8
+; NO-SVE-CHECK-NEXT: .cfi_restore b9
+; NO-SVE-CHECK-NEXT: .cfi_restore b10
+; NO-SVE-CHECK-NEXT: .cfi_restore b11
+; NO-SVE-CHECK-NEXT: .cfi_restore b12
+; NO-SVE-CHECK-NEXT: .cfi_restore b13
+; NO-SVE-CHECK-NEXT: .cfi_restore b14
+; NO-SVE-CHECK-NEXT: .cfi_restore b15
+; NO-SVE-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @streaming_callee_with_arg(i32 %x)
+ ret void
+}
+
+; Ensure we still emit async unwind information with -fno-asynchronous-unwind-tables
+; if the function contains a streaming-mode change.
+
+define void @vg_unwind_noasync() #5 {
+; CHECK-LABEL: vg_unwind_noasync:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_noasync:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+; OUTLINER-CHECK-LABEL: vg_unwind_noasync:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee();
+ ret void;
+}
+
+attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) }
+attributes #4 = { "aarch64_pstate_sm_compatible" uwtable(async) }
+attributes #5 = { "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index c39894c..106d619 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
@dst = global [512 x i8] zeroinitializer, align 1
@src = global [512 x i8] zeroinitializer, align 1
@@ -22,13 +22,14 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -71,12 +72,13 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2
@@ -119,13 +121,14 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -168,16 +171,18 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
;
; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2
@@ -190,11 +195,12 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sc_memcpy:
@@ -215,12 +221,16 @@ entry:
define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sb_memcpy:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x2, x0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: adrp x0, :got:dst
; CHECK-NEXT: adrp x1, :got:src
@@ -232,17 +242,21 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
@@ -256,15 +270,20 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sb_memcpy:
; CHECK-MOPS: // %bb.0: // %entry
-; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: rdsvl x9, #1
+; CHECK-MOPS-NEXT: lsr x9, x9, #3
+; CHECK-MOPS-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: cntd x9
+; CHECK-MOPS-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-MOPS-NEXT: smstart sm
; CHECK-MOPS-NEXT: adrp x8, :got:src
; CHECK-MOPS-NEXT: adrp x9, :got:dst
@@ -274,10 +293,11 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: smstop sm
-; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: add sp, sp, #80
; CHECK-MOPS-NEXT: ret
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)