aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
authorKerry McLaughlin <kerry.mclaughlin@arm.com>2024-06-13 17:42:11 +0100
committerGitHub <noreply@github.com>2024-06-13 17:42:11 +0100
commit93c8e0f2eb347e219077ddd4428faf9a89fff480 (patch)
treeef50b48fea2db4c96549d82d68be5154dab79a93 /llvm
parent46080abe9b136821eda2a1a27d8a13ceac349f8c (diff)
downloadllvm-93c8e0f2eb347e219077ddd4428faf9a89fff480.zip
llvm-93c8e0f2eb347e219077ddd4428faf9a89fff480.tar.gz
llvm-93c8e0f2eb347e219077ddd4428faf9a89fff480.tar.bz2
[AArch64][SME] Save VG for unwind info when changing streaming-mode (#83301)
If a function requires any streaming-mode change, the vector granule value must be stored to the stack and unwind info must also describe the save of VG to this location. This patch adds VG to the list of callee-saved registers and increases the callee-saved stack size if the function requires streaming-mode changes. A new type is added to RegPairInfo, which is also used to skip restoring the register used to spill the VG value in the epilogue. See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp246
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h11
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td15
-rw-r--r--llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll22
-rw-r--r--llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll79
-rw-r--r--llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll54
-rw-r--r--llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll518
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll43
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-body.ll152
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll131
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-interface.ll49
-rw-r--r--llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll1177
-rw-r--r--llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll76
18 files changed, 2167 insertions, 452 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cf617c7..a991813 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext())
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -558,6 +558,10 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ SMEAttrs Attrs(MF.getFunction());
+ bool LocallyStreaming =
+ Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
@@ -569,14 +573,22 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
DebugLoc DL = MBB.findDebugLoc(MBBI);
for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ unsigned FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
+
+ // The location of VG will be emitted before each streaming-mode change in
+ // the function. Only locally-streaming functions require emitting the
+ // non-streaming VG location here.
+ if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
+ (!LocallyStreaming &&
+ DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true)))
+ continue;
- int64_t Offset =
- MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -699,6 +711,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
+ if (!Info.isRestored())
+ continue;
+
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1342,6 +1357,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
+bool requiresGetVGCall(MachineFunction &MF) {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasStreamingModeChanges() &&
+ !MF.getSubtarget<AArch64Subtarget>().hasSVE();
+}
+
+bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
+ unsigned Opc = MBBI->getOpcode();
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
+ Opc == AArch64::UBFMXri)
+ return true;
+
+ if (requiresGetVGCall(*MBBI->getMF())) {
+ if (Opc == AArch64::ORRXrr)
+ return true;
+
+ if (Opc == AArch64::BL) {
+ auto Op1 = MBBI->getOperand(0);
+ return Op1.isSymbol() &&
+ (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
+ }
+ }
+
+ return false;
+}
+
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
@@ -1352,6 +1393,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
int CFAOffset = 0) {
unsigned NewOpc;
+
+ // If the function contains streaming mode changes, we expect instructions
+ // to calculate the value of VG before spilling. For locally-streaming
+ // functions, we need to do this for both the streaming and non-streaming
+ // vector length. Move past these instructions if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1408,7 +1460,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// If the first store isn't right where we want SP then we can't fold the
// update in so create a normal arithmetic instruction instead.
- MachineFunction &MF = *MBB.getParent();
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
@@ -1660,6 +1711,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::X19);
LiveRegs.removeReg(AArch64::FP);
LiveRegs.removeReg(AArch64::LR);
+
+ // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+ // The call is required to spill VG where SVE is unavailable; X0 itself is
+ // preserved around the call.
+ if (requiresGetVGCall(MF))
+ LiveRegs.removeReg(AArch64::X0);
}
auto VerifyClobberOnExit = make_scope_exit([&]() {
@@ -1846,6 +1903,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer bump above.
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
+ // Move past instructions generated to calculate VG
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -2768,7 +2830,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
RegPairInfo() = default;
@@ -2780,6 +2842,7 @@ struct RegPairInfo {
return 2;
case GPR:
case FPR64:
+ case VG:
return 8;
case ZPR:
case FPR128:
@@ -2855,6 +2918,8 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
+ else if (RPI.Reg1 == AArch64::VG)
+ RPI.Type = RegPairInfo::VG;
else
llvm_unreachable("Unsupported register class.");
@@ -2887,6 +2952,8 @@ static void computeCalleeSaveRegisterPairs(
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::VG:
+ break;
}
}
@@ -3003,6 +3070,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
@@ -3070,7 +3138,70 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ StrOpc = AArch64::STRXui;
+ Size = 8;
+ Alignment = Align(8);
+ break;
}
+
+ unsigned X0Scratch = AArch64::NoRegister;
+ if (Reg1 == AArch64::VG) {
+ // Find an available register to store value of VG to.
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+ assert(Reg1 != AArch64::NoRegister);
+ SMEAttrs Attrs(MF.getFunction());
+
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
+ AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
+ // For locally-streaming functions, we need to store both the streaming
+ // & non-streaming VG. Spill the streaming value first.
+ BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
+ .addReg(Reg1)
+ .addImm(3)
+ .addImm(63)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ AFI->setStreamingVGIdx(RPI.FrameIdx);
+ } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ AFI->setVGIdx(RPI.FrameIdx);
+ } else {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ if (llvm::any_of(
+ MBB.liveins(),
+ [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+ return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+ AArch64::X0, LiveIn.PhysReg);
+ }))
+ X0Scratch = Reg1;
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X0, RegState::Undef)
+ .addReg(AArch64::X0, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ const uint32_t *RegMask = TRI->getCallPreservedMask(
+ MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
+ .addExternalSymbol("__arm_get_current_vg")
+ .addRegMask(RegMask)
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .setMIFlag(MachineInstr::FrameSetup);
+ Reg1 = AArch64::X0;
+ AFI->setVGIdx(RPI.FrameIdx);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3162,6 +3293,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
+ .addReg(AArch64::XZR)
+ .addReg(X0Scratch, RegState::Undef)
+ .addReg(X0Scratch, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
}
return true;
}
@@ -3241,6 +3379,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ continue;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3440,6 +3580,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += RegSize;
}
+ // Increase the callee-saved stack size if the function has streaming mode
+ // changes, as we will need to spill the value of the VG register.
+ // For locally streaming functions, we spill both the streaming and
+ // non-streaming VG value.
+ const Function &F = MF.getFunction();
+ SMEAttrs Attrs(F);
+ if (AFI->hasStreamingModeChanges()) {
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ CSStackSize += 16;
+ else
+ CSStackSize += 8;
+ }
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3576,6 +3729,33 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
}
+ // Insert VG into the list of CSRs, immediately before LR if saved.
+ if (AFI->hasStreamingModeChanges()) {
+ std::vector<CalleeSavedInfo> VGSaves;
+ SMEAttrs Attrs(MF.getFunction());
+
+ auto VGInfo = CalleeSavedInfo(AArch64::VG);
+ VGInfo.setRestored(false);
+ VGSaves.push_back(VGInfo);
+
+ // Add VG again if the function is locally-streaming, as we will spill two
+ // values.
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ VGSaves.push_back(VGInfo);
+
+ bool InsertBeforeLR = false;
+
+ for (unsigned I = 0; I < CSI.size(); I++)
+ if (CSI[I].getReg() == AArch64::LR) {
+ InsertBeforeLR = true;
+ CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
+ break;
+ }
+
+ if (!InsertBeforeLR)
+ CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
@@ -4191,12 +4371,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
}
} // namespace
+MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+
+ if (MI.getOpcode() != AArch64::VGSavePseudo &&
+ MI.getOpcode() != AArch64::VGRestorePseudo)
+ return II;
+
+ SMEAttrs FuncAttrs(MF->getFunction());
+ bool LocallyStreaming =
+ FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
+ const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ int64_t VGFrameIdx =
+ LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
+ assert(VGFrameIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for VG");
+
+ unsigned CFIIndex;
+ if (MI.getOpcode() == AArch64::VGSavePseudo) {
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset =
+ MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset));
+ } else
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true)));
+
+ MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(),
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ MI.eraseFromParent();
+ return UnwindInst->getIterator();
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
- if (StackTaggingMergeSetTag)
- for (auto &BB : MF)
- for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
+ if (AFI->hasStreamingModeChanges())
+ II = emitVGSaveRestore(II, this);
+ if (StackTaggingMergeSetTag)
II = tryMergeAdjacentSTG(II, this, RS);
+ }
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4f819f..af8b9d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2493,6 +2493,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::FIRST_NUMBER:
break;
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
+ MAKE_CASE(AArch64ISD::VG_SAVE)
+ MAKE_CASE(AArch64ISD::VG_RESTORE)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -8514,6 +8516,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;
if (RequiresSMChange) {
+
+ Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Chain);
+ InGlue = Chain.getValue(1);
+
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
@@ -8691,6 +8698,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
+ InGlue = Result.getValue(1);
+
+ Result =
+ DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 48a4ea9..b57ba09 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -70,6 +70,9 @@ enum NodeType : unsigned {
COALESCER_BARRIER,
+ VG_SAVE,
+ VG_RESTORE,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index c3d64f5..957d7bc 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
const MachineFunction &MF) const {
if (!NeedsAsyncDwarfUnwindInfo) {
const Function &F = MF.getFunction();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// The check for "minsize" is because epilogue unwind info is not emitted
// (yet) for homogeneous epilogues, outlined functions, and functions
// outlined from.
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) &&
- F.getUWTableKind() == UWTableKind::Async &&
- !F.hasMinSize();
+ NeedsAsyncDwarfUnwindInfo =
+ needsDwarfUnwindInfo(MF) &&
+ ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) ||
+ AFI->hasStreamingModeChanges());
}
return *NeedsAsyncDwarfUnwindInfo;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index df09fc5..839a3a3 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -216,6 +217,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
+ // The stack slots where VG values are stored to.
+ int64_t VGIdx = std::numeric_limits<int>::max();
+ int64_t StreamingVGIdx = std::numeric_limits<int>::max();
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -234,6 +239,12 @@ public:
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ int64_t getVGIdx() const { return VGIdx; };
+ void setVGIdx(unsigned Idx) { VGIdx = Idx; };
+
+ int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
+ void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; };
+
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2b70c47..fea70b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,6 +31,12 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
@@ -221,6 +227,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
(MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
+// the streaming value of VG around streaming-mode changes in locally-streaming
+// functions.
+def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGSave), (VGSavePseudo)>;
+
+def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGRestore), (VGRestorePseudo)>;
+
//===----------------------------------------------------------------------===//
// SME2 Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
index 44d47a0..aae1a66 100644
--- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
+++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
@@ -7,10 +7,11 @@ define void @streaming_mode_change1() #0 {
; CHECK-LABEL: streaming_mode_change1:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -23,6 +24,7 @@ define void @streaming_mode_change1() #0 {
;
; OUTLINER-LABEL: streaming_mode_change1:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
@@ -31,10 +33,11 @@ define void @streaming_mode_change2() #0 {
; CHECK-LABEL: streaming_mode_change2:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -47,6 +50,7 @@ define void @streaming_mode_change2() #0 {
;
; OUTLINER-LABEL: streaming_mode_change2:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
@@ -55,10 +59,11 @@ define void @streaming_mode_change3() #0 {
; CHECK-LABEL: streaming_mode_change3:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
@@ -71,6 +76,7 @@ define void @streaming_mode_change3() #0 {
;
; OUTLINER-LABEL: streaming_mode_change3:
; OUTLINER-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 0737719..c4440e7 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -10,11 +10,13 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
@@ -44,12 +47,14 @@ declare void @non_streaming()
define void @streaming_compatible_arg(float %f) #0 {
; CHECK-LABEL: streaming_compatible_arg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
@@ -63,12 +68,13 @@ define void @streaming_compatible_arg(float %f) #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
call void @non_streaming(float %f)
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 254e37e..d786ffd 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
@@ -17,6 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT: cntd x9
+; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstart sm
; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -43,6 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT: cntd x9
+; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstart sm
; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -76,6 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -102,12 +108,17 @@ entry:
define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: sub sp, sp, #128
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -129,7 +140,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #112
+; CHECK-COMMON-NEXT: add sp, sp, #128
; CHECK-COMMON-NEXT: ret
%call = call double @normal_callee(double %x);
%add = fadd double %call, 4.200000e+01
@@ -166,11 +177,16 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -178,7 +194,7 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
call void %p() "aarch64_pstate_sm_enabled"
ret void
@@ -192,6 +208,8 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
; CHECK-COMMON: // %bb.0: // %prelude
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -240,7 +259,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: smstop za
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @za_shared_callee(double %x)
@@ -251,7 +271,8 @@ entry:
define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -279,7 +300,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: fmov d1, x8
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @normal_callee(double %x)
@@ -291,7 +313,8 @@ entry:
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: f128_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -314,7 +337,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB8_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = fadd fp128 %a, %b
ret fp128 %res
@@ -326,21 +350,22 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: f128_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #112
@@ -353,7 +378,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: frem_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -376,7 +402,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB10_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = frem double %a, %b
ret double %res
@@ -387,21 +414,22 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: frem_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl fmodf
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #96
@@ -414,12 +442,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-COMMON-LABEL: frem_call_sm_compat:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
; CHECK-COMMON-NEXT: and x19, x0, #0x1
@@ -434,13 +464,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: // %bb.3:
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: .LBB12_4:
-; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #96
+; CHECK-COMMON-NEXT: add sp, sp, #112
; CHECK-COMMON-NEXT: ret
%res = frem float %a, %b
ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0..b0d6e04 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme < %s | FileCheck %s
declare void @private_za_callee()
declare float @llvm.cos.f32(float)
@@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float)
define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_1_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
@@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_2_callees:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
+; CHECK-NEXT: rdsvl x20, #1
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: msub x8, x20, x20, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: sub x21, x29, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
call void @private_za_callee()
@@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call float @llvm.cos.f32(float %a)
ret float %res
@@ -121,13 +127,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -140,13 +148,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sturh w8, [x29, #-72]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB3_2
+; CHECK-NEXT: and x20, x0, #0x1
+; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: tbz w19, #0, .LBB3_4
+; CHECK-NEXT: tbz w20, #0, .LBB3_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB3_4:
@@ -159,12 +167,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 1d1bae4..500c511 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -16,11 +16,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -28,12 +29,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,11 +50,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -61,12 +63,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -82,11 +84,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -94,12 +97,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -115,11 +118,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov x19, x1
@@ -127,12 +131,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -148,11 +152,12 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -165,14 +170,14 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -188,11 +193,12 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@@ -205,14 +211,14 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: bl use_f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -228,11 +234,12 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -245,14 +252,14 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -273,11 +280,12 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -290,14 +298,14 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -314,11 +322,12 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -331,14 +340,14 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -355,11 +364,12 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -372,14 +382,14 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -396,11 +406,12 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -413,14 +424,14 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -437,11 +448,12 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -454,14 +466,14 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -478,11 +490,12 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -495,14 +508,14 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -519,11 +532,12 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -536,14 +550,14 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -564,11 +578,12 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -581,14 +596,14 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -604,11 +619,12 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -621,14 +637,14 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -644,11 +660,12 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -661,14 +678,14 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -684,11 +701,12 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -701,14 +719,14 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -724,11 +742,12 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -741,14 +760,14 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -764,11 +783,12 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -781,14 +801,14 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8bf16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -804,11 +824,12 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -821,14 +842,14 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -844,11 +865,12 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -861,14 +883,14 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -887,11 +909,12 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -900,10 +923,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i1
@@ -913,8 +936,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: str p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -933,23 +956,26 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
define void @dont_coalesce_res_i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i8 @get_i8()
%vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0
@@ -960,23 +986,26 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 {
define void @dont_coalesce_res_i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i16 @get_i16()
%vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0
@@ -987,23 +1016,26 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 {
define void @dont_coalesce_res_i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i32 @get_i32()
%vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0
@@ -1014,23 +1046,26 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 {
define void @dont_coalesce_res_i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i64 @get_i64()
%vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0
@@ -1041,27 +1076,30 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 {
define void @dont_coalesce_res_f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call half @get_f16()
%vec = insertelement <vscale x 8 x half> poison, half %res, i32 0
@@ -1072,12 +1110,14 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 {
define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f32
@@ -1087,11 +1127,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call float @get_f32()
%vec = insertelement <vscale x 4 x float> poison, float %res, i32 0
@@ -1102,12 +1143,14 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f64
@@ -1117,11 +1160,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call double @get_f64()
%vec = insertelement <vscale x 2 x double> poison, double %res, i32 0
@@ -1136,12 +1180,14 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i8
@@ -1151,11 +1197,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i8> @get_v1i8()
%elt = extractelement <1 x i8> %res, i32 0
@@ -1167,12 +1214,14 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i16
@@ -1182,11 +1231,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i16> @get_v1i16()
%elt = extractelement <1 x i16> %res, i32 0
@@ -1198,12 +1248,14 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i32
@@ -1213,11 +1265,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i32> @get_v1i32()
%elt = extractelement <1 x i32> %res, i32 0
@@ -1229,12 +1282,14 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i64
@@ -1244,11 +1299,12 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i64> @get_v1i64()
%elt = extractelement <1 x i64> %res, i32 0
@@ -1260,27 +1316,30 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x half> @get_v1f16()
%elt = extractelement <1 x half> %res, i32 0
@@ -1292,12 +1351,14 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f32
@@ -1307,11 +1368,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x float> @get_v1f32()
%elt = extractelement <1 x float> %res, i32 0
@@ -1323,12 +1385,14 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f64
@@ -1338,11 +1402,12 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x double> @get_v1f64()
%elt = extractelement <1 x double> %res, i32 0
@@ -1358,27 +1423,30 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v16i8
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <16 x i8> @get_v16i8()
%vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0)
@@ -1389,27 +1457,30 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8i16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x i16> @get_v8i16()
%vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0)
@@ -1420,27 +1491,30 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4i32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x i32> @get_v4i32()
%vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0)
@@ -1451,27 +1525,30 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2i64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @get_v2i64()
%vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0)
@@ -1482,27 +1559,30 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8f16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x half> @get_v8f16()
%vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0)
@@ -1513,27 +1593,30 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4f32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x float> @get_v4f32()
%vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0)
@@ -1544,27 +1627,30 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2f64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x double> @get_v2f64()
%vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index d675733..6c8aff5 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_sm_compatible_simple:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x8, x0, #0x1
; CHECK-NEXT: tbnz w8, #0, .LBB0_2
@@ -28,7 +32,7 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
ret float zeroinitializer
}
@@ -36,11 +40,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB1_2
@@ -54,11 +62,12 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee()
ret void
@@ -68,12 +77,16 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -87,11 +100,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.4: // %if.else
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_5: // %if.else
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_6: // %if.then
; CHECK-NEXT: smstop sm
@@ -101,11 +115,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_8: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index cd6d45f5..3afd571 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: bl streaming_compatible_callee
@@ -21,7 +25,7 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_compatible_callee();
@@ -47,26 +51,33 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_
define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_multiple_exit:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: cmp x0, #1
; CHECK-NEXT: b.ne .LBB2_2
; CHECK-NEXT: // %bb.1: // %if.else
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %if.end
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
entry:
@@ -87,11 +98,16 @@ if.end:
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_no_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: index z0.d, #0, #1
@@ -102,12 +118,12 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
%add = add <2 x i64> %a, <i64 41, i64 42>;
@@ -120,11 +136,15 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl locally_streaming_caller_streaming_callee
@@ -134,7 +154,7 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @locally_streaming_caller_streaming_callee();
@@ -151,12 +171,16 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -169,7 +193,7 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
ret <2 x i64> %res;
@@ -180,12 +204,16 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_
define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -198,7 +226,7 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
%v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
%res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
@@ -212,11 +240,16 @@ declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<
define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: locally_streaming_caller_alloca:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-NEXT: addsvl sp, sp, #-1
; CHECK-NEXT: smstart sm
; CHECK-NEXT: mov x0, sp
@@ -227,7 +260,7 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body"
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%alloca = alloca <vscale x 4 x i32>
call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
@@ -239,12 +272,16 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
@@ -259,7 +296,7 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
entry:
%0 = call fast double @llvm.cos.f64(double %x)
@@ -272,11 +309,16 @@ declare double @llvm.cos.f64(double)
define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: test_arg_survives_loop:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_1: // %for.body
@@ -289,12 +331,12 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
; CHECK-NEXT: fadd s0, s1, s0
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
entry:
br label %for.body
@@ -314,11 +356,15 @@ for.cond.cleanup:
define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: smstop sm
@@ -326,7 +372,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @streaming_compatible_callee();
ret void;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1e16f14..58992eb 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -mattr=+sme < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> SC (Normal -> Streaming-compatible)
@@ -36,11 +36,13 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB1_2
@@ -52,11 +54,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee();
@@ -72,11 +75,13 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -88,11 +93,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee();
@@ -124,11 +130,12 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-LABEL: streaming_compatible_with_neon_vectors:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: add x8, sp, #16
@@ -136,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: add x8, sp, #16
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -160,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -176,8 +183,9 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -255,8 +263,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
%fadd = fadd <vscale x 2 x double> %res, %arg
@@ -268,8 +276,9 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -347,8 +356,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
@@ -360,11 +369,13 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_unreachable_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
@@ -372,6 +383,10 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_4:
call void @streaming_callee()
unreachable
}
@@ -381,11 +396,13 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK: // %bb.0:
; CHECK-NEXT: tbz w0, #0, .LBB8_6
; CHECK-NEXT: // %bb.1: // %if.then
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB8_3
@@ -397,11 +414,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -417,11 +435,13 @@ exit:
define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB9_2
@@ -433,11 +453,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @normal_callee();
@@ -447,29 +468,32 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 112
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: .cfi_offset b8, -24
-; CHECK-NEXT: .cfi_offset b9, -32
-; CHECK-NEXT: .cfi_offset b10, -40
-; CHECK-NEXT: .cfi_offset b11, -48
-; CHECK-NEXT: .cfi_offset b12, -56
-; CHECK-NEXT: .cfi_offset b13, -64
-; CHECK-NEXT: .cfi_offset b14, -72
-; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x1
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbz w19, #0, .LBB10_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: smstop sm
@@ -483,12 +507,25 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB10_4: // %entry
-; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 465fb466..4321493 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> S (Normal -> Streaming)
@@ -22,10 +22,11 @@ define void @normal_caller_streaming_callee() nounwind {
; CHECK-LABEL: normal_caller_streaming_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -47,10 +48,11 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
; CHECK-LABEL: streaming_caller_normal_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl normal_callee
; CHECK-NEXT: smstart sm
@@ -103,10 +105,11 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstop sm
@@ -125,19 +128,20 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_simdfp:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
@@ -150,7 +154,9 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -216,7 +222,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
ret <vscale x 4 x i32> %x;
@@ -227,7 +233,9 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve_duplicate:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -296,7 +304,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
call void @streaming_callee()
@@ -308,11 +316,12 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -320,11 +329,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
@@ -342,10 +351,11 @@ define void @disable_tailcallopt() nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -362,11 +372,13 @@ define void @disable_tailcallopt() nounwind {
define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: rdsvl x3, #1
; CHECK-NEXT: addvl x0, sp, #2
@@ -383,7 +395,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%Data1 = alloca <vscale x 16 x i8>, align 16
@@ -400,11 +412,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 45ca784..ac19bd5 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -15,11 +15,12 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-LABEL: test_no_stackslot_scavenging:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
@@ -31,8 +32,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -46,4 +47,4 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
declare void @use_f(float)
-attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" }
+attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
new file mode 100644
index 0000000..6264ce0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -0,0 +1,1177 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -enable-machine-outliner < %s | FileCheck %s --check-prefix=OUTLINER-CHECK
+
+declare void @callee();
+declare void @fixed_callee(<4 x i32>);
+declare void @scalable_callee(<vscale x 2 x i64>);
+
+declare void @streaming_callee() #0;
+declare void @streaming_callee_with_arg(i32) #0;
+
+; Simple example of a function with one call requiring a streaming mode change
+;
+define void @vg_unwind_simple() #0 {
+; CHECK-LABEL: vg_unwind_simple:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_simple:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_simple:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee();
+ ret void;
+}
+
+; As above, with an extra register clobbered by the inline asm call which
+; changes NeedsGapToAlignStack to false
+;
+define void @vg_unwind_needs_gap() #0 {
+; CHECK-LABEL: vg_unwind_needs_gap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w20
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_needs_gap:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x20, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w20, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w20
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void asm sideeffect "", "~{x20}"()
+ call void @callee();
+ ret void;
+}
+
+define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_fixed_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl fixed_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_fixed_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: sub sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 112
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #80
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl fixed_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 112
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: add sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @fixed_callee(<4 x i32> %x);
+ ret void;
+}
+
+define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_sve_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl scalable_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_sve_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 48
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; FP-CHECK-NEXT: mov x29, sp
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 48
+; FP-CHECK-NEXT: .cfi_offset w27, -8
+; FP-CHECK-NEXT: .cfi_offset w28, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -40
+; FP-CHECK-NEXT: .cfi_offset w29, -48
+; FP-CHECK-NEXT: addvl sp, sp, #-18
+; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -32
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl scalable_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #18
+; FP-CHECK-NEXT: .cfi_restore z8
+; FP-CHECK-NEXT: .cfi_restore z9
+; FP-CHECK-NEXT: .cfi_restore z10
+; FP-CHECK-NEXT: .cfi_restore z11
+; FP-CHECK-NEXT: .cfi_restore z12
+; FP-CHECK-NEXT: .cfi_restore z13
+; FP-CHECK-NEXT: .cfi_restore z14
+; FP-CHECK-NEXT: .cfi_restore z15
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 48
+; FP-CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w27
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void asm sideeffect "", "~{x28}"()
+ call void @scalable_callee(<vscale x 2 x i64> %x);
+ ret void;
+}
+
+; This test was based on stack-probing-64k.ll and tries to test multiple uses of
+; findScratchNonCalleeSaveRegister.
+;
+define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
+; CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327776
+; CHECK-NEXT: .LBB4_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b.ne .LBB4_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; FP-CHECK: // %bb.0: // %entry
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w28, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .LBB4_1: // %entry
+; FP-CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; FP-CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; FP-CHECK-NEXT: cmp sp, x9
+; FP-CHECK-NEXT: str xzr, [sp]
+; FP-CHECK-NEXT: b.ne .LBB4_1
+; FP-CHECK-NEXT: // %bb.2: // %entry
+; FP-CHECK-NEXT: mov x8, sp
+; FP-CHECK-NEXT: str x8, [x0]
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+entry:
+ %v = alloca i8, i64 327680, align 1
+ store ptr %v, ptr %out, align 8
+ call void @callee()
+ ret void
+}
+
+; Locally streaming functions require storing both the streaming and
+; non-streaming values of VG.
+;
+define void @vg_locally_streaming_fn() #3 {
+; CHECK-LABEL: vg_locally_streaming_fn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_locally_streaming_fn:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: rdsvl x9, #1
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: lsr x9, x9, #3
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset vg, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee()
+ call void @streaming_callee()
+ call void @callee()
+ ret void
+}
+
+define void @streaming_compatible_to_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB6_2:
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB6_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @streaming_callee()
+ ret void
+}
+
+define void @streaming_compatible_to_non_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_non_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbz w19, #0, .LBB7_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB7_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB7_2:
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB7_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee()
+ ret void
+}
+
+; If the target does not have SVE, do not emit cntd in the prologue and
+; instead spill the result returned by __arm_get_current_vg.
+; This requires preserving the argument %x as the vg value is returned
+; in X0.
+;
+define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
+; NO-SVE-CHECK-LABEL: streaming_compatible_no_sve:
+; NO-SVE-CHECK: // %bb.0:
+; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96
+; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x9, x0
+; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg
+; NO-SVE-CHECK-NEXT: stp x0, x19, [sp, #80] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x0, x9
+; NO-SVE-CHECK-NEXT: add x29, sp, #64
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32
+; NO-SVE-CHECK-NEXT: .cfi_offset w19, -8
+; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24
+; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32
+; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40
+; NO-SVE-CHECK-NEXT: .cfi_offset b9, -48
+; NO-SVE-CHECK-NEXT: .cfi_offset b10, -56
+; NO-SVE-CHECK-NEXT: .cfi_offset b11, -64
+; NO-SVE-CHECK-NEXT: .cfi_offset b12, -72
+; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80
+; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88
+; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
+; NO-SVE-CHECK-NEXT: mov w8, w0
+; NO-SVE-CHECK-NEXT: bl __arm_sme_state
+; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2
+; NO-SVE-CHECK-NEXT: // %bb.1:
+; NO-SVE-CHECK-NEXT: smstart sm
+; NO-SVE-CHECK-NEXT: .LBB8_2:
+; NO-SVE-CHECK-NEXT: mov w0, w8
+; NO-SVE-CHECK-NEXT: bl streaming_callee_with_arg
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4
+; NO-SVE-CHECK-NEXT: // %bb.3:
+; NO-SVE-CHECK-NEXT: smstop sm
+; NO-SVE-CHECK-NEXT: .LBB8_4:
+; NO-SVE-CHECK-NEXT: .cfi_restore vg
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 0
+; NO-SVE-CHECK-NEXT: .cfi_restore w19
+; NO-SVE-CHECK-NEXT: .cfi_restore w30
+; NO-SVE-CHECK-NEXT: .cfi_restore w29
+; NO-SVE-CHECK-NEXT: .cfi_restore b8
+; NO-SVE-CHECK-NEXT: .cfi_restore b9
+; NO-SVE-CHECK-NEXT: .cfi_restore b10
+; NO-SVE-CHECK-NEXT: .cfi_restore b11
+; NO-SVE-CHECK-NEXT: .cfi_restore b12
+; NO-SVE-CHECK-NEXT: .cfi_restore b13
+; NO-SVE-CHECK-NEXT: .cfi_restore b14
+; NO-SVE-CHECK-NEXT: .cfi_restore b15
+; NO-SVE-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @streaming_callee_with_arg(i32 %x)
+ ret void
+}
+
+; Ensure we still emit async unwind information with -fno-asynchronous-unwind-tables
+; if the function contains a streaming-mode change.
+
+define void @vg_unwind_noasync() #5 {
+; CHECK-LABEL: vg_unwind_noasync:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_noasync:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+; OUTLINER-CHECK-LABEL: vg_unwind_noasync:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
+ call void @callee();
+ ret void;
+}
+
+attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) }
+attributes #4 = { "aarch64_pstate_sm_compatible" uwtable(async) }
+attributes #5 = { "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index c39894c..106d619 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
@dst = global [512 x i8] zeroinitializer, align 1
@src = global [512 x i8] zeroinitializer, align 1
@@ -22,13 +22,14 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -71,12 +72,13 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2
@@ -119,13 +121,14 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -168,16 +171,18 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
;
; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2
@@ -190,11 +195,12 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sc_memcpy:
@@ -215,12 +221,16 @@ entry:
define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sb_memcpy:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x2, x0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: adrp x0, :got:dst
; CHECK-NEXT: adrp x1, :got:src
@@ -232,17 +242,21 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
@@ -256,15 +270,20 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sb_memcpy:
; CHECK-MOPS: // %bb.0: // %entry
-; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: rdsvl x9, #1
+; CHECK-MOPS-NEXT: lsr x9, x9, #3
+; CHECK-MOPS-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: cntd x9
+; CHECK-MOPS-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-MOPS-NEXT: smstart sm
; CHECK-MOPS-NEXT: adrp x8, :got:src
; CHECK-MOPS-NEXT: adrp x9, :got:dst
@@ -274,10 +293,11 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: smstop sm
-; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: add sp, sp, #80
; CHECK-MOPS-NEXT: ret
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)