diff options
Diffstat (limited to 'llvm')
18 files changed, 2167 insertions, 452 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index cf617c7..a991813 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; auto *AFI = MF.getInfo<AArch64FunctionInfo>(); - if (AFI->hasSwiftAsyncContext()) + if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges()) return false; // If there are an odd number of GPRs before LR and FP in the CSRs list, @@ -558,6 +558,10 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + SMEAttrs Attrs(MF.getFunction()); + bool LocallyStreaming = + Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface(); const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) @@ -569,14 +573,22 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations( DebugLoc DL = MBB.findDebugLoc(MBBI); for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) + unsigned FrameIdx = Info.getFrameIdx(); + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); - unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); + int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); + int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea(); + + // The location of VG will be emitted before each streaming-mode change in + // the function. Only locally-streaming functions require emitting the + // non-streaming VG location here. + if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) || + (!LocallyStreaming && + DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true))) + continue; - int64_t Offset = - MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -699,6 +711,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB, !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) continue; + if (!Info.isRestored()) + continue; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( nullptr, TRI.getDwarfRegNum(Info.getReg(), true))); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -1342,6 +1357,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize); } +bool requiresGetVGCall(MachineFunction &MF) { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasStreamingModeChanges() && + !MF.getSubtarget<AArch64Subtarget>().hasSVE(); +} + +bool isVGInstruction(MachineBasicBlock::iterator MBBI) { + unsigned Opc = MBBI->getOpcode(); + if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || + Opc == AArch64::UBFMXri) + return true; + + if (requiresGetVGCall(*MBBI->getMF())) { + if (Opc == AArch64::ORRXrr) + return true; + + if (Opc == AArch64::BL) { + auto Op1 = MBBI->getOperand(0); + return Op1.isSymbol() && + (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); + } + } + + return false; +} + // Convert callee-save register save/restore instruction to do stack pointer // decrement/increment to allocate/deallocate the callee-save stack area by // converting store/load to use pre/post increment version. @@ -1352,6 +1393,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, int CFAOffset = 0) { unsigned NewOpc; + + // If the function contains streaming mode changes, we expect instructions + // to calculate the value of VG before spilling. For locally-streaming + // functions, we need to do this for both the streaming and non-streaming + // vector length. Move past these instructions if necessary. + MachineFunction &MF = *MBB.getParent(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + if (AFI->hasStreamingModeChanges()) + while (isVGInstruction(MBBI)) + ++MBBI; + switch (MBBI->getOpcode()) { default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); @@ -1408,7 +1460,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // If the first store isn't right where we want SP then we can't fold the // update in so create a normal arithmetic instruction instead. - MachineFunction &MF = *MBB.getParent(); if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, @@ -1660,6 +1711,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, LiveRegs.removeReg(AArch64::X19); LiveRegs.removeReg(AArch64::FP); LiveRegs.removeReg(AArch64::LR); + + // X0 will be clobbered by a call to __arm_get_current_vg in the prologue. + // This is necessary to spill VG if required where SVE is unavailable, but + // X0 is preserved around this call. + if (requiresGetVGCall(MF)) + LiveRegs.removeReg(AArch64::X0); } auto VerifyClobberOnExit = make_scope_exit([&]() { @@ -1846,6 +1903,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // pointer bump above. while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { + // Move past instructions generated to calculate VG + if (AFI->hasStreamingModeChanges()) + while (isVGInstruction(MBBI)) + ++MBBI; + if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); @@ -2768,7 +2830,7 @@ struct RegPairInfo { unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; - enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type; + enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; RegPairInfo() = default; @@ -2780,6 +2842,7 @@ struct RegPairInfo { return 2; case GPR: case FPR64: + case VG: return 8; case ZPR: case FPR128: @@ -2855,6 +2918,8 @@ static void computeCalleeSaveRegisterPairs( RPI.Type = RegPairInfo::ZPR; else if (AArch64::PPRRegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::PPR; + else if (RPI.Reg1 == AArch64::VG) + RPI.Type = RegPairInfo::VG; else llvm_unreachable("Unsupported register class."); @@ -2887,6 +2952,8 @@ static void computeCalleeSaveRegisterPairs( if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) RPI.Reg2 = NextReg; break; + case RegPairInfo::VG: + break; } } @@ -3003,6 +3070,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool NeedsWinCFI = needsWinCFI(MF); DebugLoc DL; SmallVector<RegPairInfo, 8> RegPairs; @@ -3070,7 +3138,70 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( Size = 2; Alignment = Align(2); break; + case RegPairInfo::VG: + StrOpc = AArch64::STRXui; + Size = 8; + Alignment = Align(8); + break; } + + unsigned X0Scratch = AArch64::NoRegister; + if (Reg1 == AArch64::VG) { + // Find an available register to store value of VG to. + Reg1 = findScratchNonCalleeSaveRegister(&MBB); + assert(Reg1 != AArch64::NoRegister); + SMEAttrs Attrs(MF.getFunction()); + + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() && + AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) { + // For locally-streaming functions, we need to store both the streaming + // & non-streaming VG. Spill the streaming value first. + BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1) + .addImm(1) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1) + .addReg(Reg1) + .addImm(3) + .addImm(63) + .setMIFlag(MachineInstr::FrameSetup); + + AFI->setStreamingVGIdx(RPI.FrameIdx); + } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { + BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) + .addImm(31) + .addImm(1) + .setMIFlag(MachineInstr::FrameSetup); + AFI->setVGIdx(RPI.FrameIdx); + } else { + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + if (llvm::any_of( + MBB.liveins(), + [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return STI.getRegisterInfo()->isSuperOrSubRegisterEq( + AArch64::X0, LiveIn.PhysReg); + })) + X0Scratch = Reg1; + + if (X0Scratch != AArch64::NoRegister) + BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) + .addReg(AArch64::XZR) + .addReg(AArch64::X0, RegState::Undef) + .addReg(AArch64::X0, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + + const uint32_t *RegMask = TRI->getCallPreservedMask( + MF, + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) + .addExternalSymbol("__arm_get_current_vg") + .addRegMask(RegMask) + .addReg(AArch64::X0, RegState::ImplicitDefine) + .setMIFlag(MachineInstr::FrameSetup); + Reg1 = AArch64::X0; + AFI->setVGIdx(RPI.FrameIdx); + } + } + LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; @@ -3162,6 +3293,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); } + + if (X0Scratch != AArch64::NoRegister) + BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0) + .addReg(AArch64::XZR) + .addReg(X0Scratch, RegState::Undef) + .addReg(X0Scratch, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); } return true; } @@ -3241,6 +3379,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( Size = 2; Alignment = Align(2); break; + case RegPairInfo::VG: + continue; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -3440,6 +3580,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, CSStackSize += RegSize; } + // Increase the callee-saved stack size if the function has streaming mode + // changes, as we will need to spill the value of the VG register. + // For locally streaming functions, we spill both the streaming and + // non-streaming VG value. + const Function &F = MF.getFunction(); + SMEAttrs Attrs(F); + if (AFI->hasStreamingModeChanges()) { + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) + CSStackSize += 16; + else + CSStackSize += 8; + } + // Save number of saved regs, so we can easily update CSStackSize later. unsigned NumSavedRegs = SavedRegs.count(); @@ -3576,6 +3729,33 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } + // Insert VG into the list of CSRs, immediately before LR if saved. + if (AFI->hasStreamingModeChanges()) { + std::vector<CalleeSavedInfo> VGSaves; + SMEAttrs Attrs(MF.getFunction()); + + auto VGInfo = CalleeSavedInfo(AArch64::VG); + VGInfo.setRestored(false); + VGSaves.push_back(VGInfo); + + // Add VG again if the function is locally-streaming, as we will spill two + // values. + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) + VGSaves.push_back(VGInfo); + + bool InsertBeforeLR = false; + + for (unsigned I = 0; I < CSI.size(); I++) + if (CSI[I].getReg() == AArch64::LR) { + InsertBeforeLR = true; + CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end()); + break; + } + + if (!InsertBeforeLR) + CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end()); + } + for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -4191,12 +4371,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, } } // namespace +MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II, + const AArch64FrameLowering *TFI) { + MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + + if (MI.getOpcode() != AArch64::VGSavePseudo && + MI.getOpcode() != AArch64::VGRestorePseudo) + return II; + + SMEAttrs FuncAttrs(MF->getFunction()); + bool LocallyStreaming = + FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface(); + const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); + + int64_t VGFrameIdx = + LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx(); + assert(VGFrameIdx != std::numeric_limits<int>::max() && + "Expected FrameIdx for VG"); + + unsigned CFIIndex; + if (MI.getOpcode() == AArch64::VGSavePseudo) { + const MachineFrameInfo &MFI = MF->getFrameInfo(); + int64_t Offset = + MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea(); + CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset( + nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset)); + } else + CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore( + nullptr, TRI->getDwarfRegNum(AArch64::VG, true))); + + MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(), + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + MI.eraseFromParent(); + return UnwindInst->getIterator(); +} + void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( MachineFunction &MF, RegScavenger *RS = nullptr) const { - if (StackTaggingMergeSetTag) - for (auto &BB : MF) - for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + for (auto &BB : MF) + for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) { + if (AFI->hasStreamingModeChanges()) + II = emitVGSaveRestore(II, this); + if (StackTaggingMergeSetTag) II = tryMergeAdjacentSTG(II, this, RS); + } } /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c4f819f..af8b9d9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2493,6 +2493,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::FIRST_NUMBER: break; MAKE_CASE(AArch64ISD::COALESCER_BARRIER) + MAKE_CASE(AArch64ISD::VG_SAVE) + MAKE_CASE(AArch64ISD::VG_RESTORE) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) @@ -8514,6 +8516,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { + + Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain); + InGlue = Chain.getValue(1); + SDValue NewChain = changeStreamingMode( DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue, getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); @@ -8691,6 +8698,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Result = changeStreamingMode( DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue, getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); + InGlue = Result.getValue(1); + + Result = + DAG.getNode(AArch64ISD::VG_RESTORE, DL, + DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue}); } if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 48a4ea9..b57ba09 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -70,6 +70,9 @@ enum NodeType : unsigned { COALESCER_BARRIER, + VG_SAVE, + VG_RESTORE, + SMSTART, SMSTOP, RESTORE_ZA, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index c3d64f5..957d7bc 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo( const MachineFunction &MF) const { if (!NeedsAsyncDwarfUnwindInfo) { const Function &F = MF.getFunction(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // The check got "minsize" is because epilogue unwind info is not emitted // (yet) for homogeneous epilogues, outlined functions, and functions // outlined from. - NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) && - F.getUWTableKind() == UWTableKind::Async && - !F.hasMinSize(); + NeedsAsyncDwarfUnwindInfo = + needsDwarfUnwindInfo(MF) && + ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) || + AFI->hasStreamingModeChanges()); } return *NeedsAsyncDwarfUnwindInfo; } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index df09fc5..839a3a3 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#include "AArch64Subtarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -216,6 +217,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. unsigned PredicateRegForFillSpill = 0; + // The stack slots where VG values are stored to. + int64_t VGIdx = std::numeric_limits<int>::max(); + int64_t StreamingVGIdx = std::numeric_limits<int>::max(); + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -234,6 +239,12 @@ public: Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + int64_t getVGIdx() const { return VGIdx; }; + void setVGIdx(unsigned Idx) { VGIdx = Idx; }; + + int64_t getStreamingVGIdx() const { return StreamingVGIdx; }; + void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; }; + bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 2b70c47..fea70b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -31,6 +31,12 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, def AArch64CoalescerBarrier : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; +def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + //===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// @@ -221,6 +227,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)), (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>; +// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore +// the streaming value of VG around streaming-mode changes in locally-streaming +// functions. +def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +def : Pat<(AArch64VGSave), (VGSavePseudo)>; + +def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +def : Pat<(AArch64VGRestore), (VGRestorePseudo)>; + //===----------------------------------------------------------------------===// // SME2 Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll index 44d47a0..aae1a66 100644 --- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll +++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll @@ -7,10 +7,11 @@ define void @streaming_mode_change1() #0 { ; CHECK-LABEL: streaming_mode_change1: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -23,6 +24,7 @@ define void @streaming_mode_change1() #0 { ; ; OUTLINER-LABEL: streaming_mode_change1: ; OUTLINER-NOT: OUTLINED_FUNCTION_ +; call void @callee(); ret void; } @@ -31,10 +33,11 @@ define void @streaming_mode_change2() #0 { ; CHECK-LABEL: streaming_mode_change2: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -47,6 +50,7 @@ define void @streaming_mode_change2() #0 { ; ; OUTLINER-LABEL: streaming_mode_change2: ; OUTLINER-NOT: OUTLINED_FUNCTION_ +; call void @callee(); ret void; } @@ -55,10 +59,11 @@ define void @streaming_mode_change3() #0 { ; CHECK-LABEL: streaming_mode_change3: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm @@ -71,6 +76,7 @@ define void @streaming_mode_change3() #0 { ; ; OUTLINER-LABEL: streaming_mode_change3: ; OUTLINER-NOT: OUTLINED_FUNCTION_ +; call void @callee(); ret void; } diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll index 0737719..c4440e7 100644 --- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -10,11 +10,13 @@ target triple = "aarch64" define void @streaming_compatible() #0 { ; CHECK-LABEL: streaming_compatible: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB0_2 @@ -26,11 +28,12 @@ define void @streaming_compatible() #0 { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @non_streaming() ret void @@ -44,12 +47,14 @@ declare void @non_streaming() define void @streaming_compatible_arg(float %f) #0 { ; CHECK-LABEL: streaming_compatible_arg: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 @@ -63,12 +68,13 @@ define void @streaming_compatible_arg(float %f) #0 { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret call void @non_streaming(float %f) ret void diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 254e37e..d786ffd 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \ +; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL -; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \ +; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL @@ -17,6 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: cntd x9 +; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm ; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -43,6 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: cntd x9 +; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm ; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -76,6 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -102,12 +108,17 @@ entry: define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { ; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: sub sp, sp, #128 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: lsr x9, x9, #3 +; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: str x9, [sp, #112] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload @@ -129,7 +140,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: add sp, sp, #112 +; CHECK-COMMON-NEXT: add sp, sp, #128 ; CHECK-COMMON-NEXT: ret %call = call double @normal_callee(double %x); %add = fadd double %call, 4.200000e+01 @@ -166,11 +177,16 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" { ; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: lsr x9, x9, #3 +; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: blr x0 ; CHECK-COMMON-NEXT: smstop sm @@ -178,7 +194,7 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret call void %p() "aarch64_pstate_sm_enabled" ret void @@ -192,6 +208,8 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: cntd x9 +; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: blr x0 ; CHECK-COMMON-NEXT: smstop sm @@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za" define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -240,7 +259,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstop za ; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret entry: %call = call double @za_shared_callee(double %x) @@ -251,7 +271,8 @@ entry: define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{ ; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee: ; CHECK-COMMON: // %bb.0: // %entry -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -279,7 +300,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret entry: %call = call double @normal_callee(double %x) @@ -291,7 +313,8 @@ entry: define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-LABEL: f128_call_za: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -314,7 +337,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: .LBB8_2: ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr ; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret %res = fadd fp128 %a, %b ret fp128 %res @@ -326,21 +350,22 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-LABEL: f128_call_sm: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: cntd x9 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm -; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: add sp, sp, #112 @@ -353,7 +378,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-LABEL: frem_call_za: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -376,7 +402,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: .LBB10_2: ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr ; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret %res = frem double %a, %b ret double %res @@ -387,21 +414,22 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-LABEL: frem_call_sm: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: cntd x9 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf ; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm -; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: add sp, sp, #96 @@ -414,12 +442,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-COMMON-LABEL: frem_call_sm_compat: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: cntd x9 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state ; CHECK-COMMON-NEXT: and x19, x0, #0x1 @@ -434,13 +464,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: // %bb.3: ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: .LBB12_4: -; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: add sp, sp, #112 ; CHECK-COMMON-NEXT: ret %res = frem float %a, %b ret float %res diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 9d635f0..b0d6e04 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme < %s | FileCheck %s declare void @private_za_callee() declare float @llvm.cos.f32(float) @@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float) define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_1_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 @@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @private_za_callee() ret void @@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: rdsvl x20, #1 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x19, x19, x8 +; CHECK-NEXT: msub x8, x20, x20, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: sub x21, x29, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh w19, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w19, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @private_za_callee() call void @private_za_callee() @@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_expanded_intrinsic: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 @@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call float @llvm.cos.f32(float %a) ret float %res @@ -121,13 +127,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: test_lazy_save_and_conditional_smstart: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp @@ -140,13 +148,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: sturh w8, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz w19, #0, .LBB3_2 +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: tbz w19, #0, .LBB3_4 +; CHECK-NEXT: tbz w20, #0, .LBB3_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB3_4: @@ -159,12 +167,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: .LBB3_6: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @private_za_callee() ret void diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index 1d1bae4..500c511 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -16,11 +16,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: mov x19, x1 @@ -28,12 +29,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -49,11 +50,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: mov x19, x1 @@ -61,12 +63,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -82,11 +84,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: mov x19, x1 @@ -94,12 +97,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -115,11 +118,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: mov x19, x1 @@ -127,12 +131,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -148,11 +152,12 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 @@ -165,14 +170,14 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -188,11 +193,12 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 @@ -205,14 +211,14 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -228,11 +234,12 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -245,14 +252,14 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -273,11 +280,12 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -290,14 +298,14 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -314,11 +322,12 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -331,14 +340,14 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -355,11 +364,12 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -372,14 +382,14 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -396,11 +406,12 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -413,14 +424,14 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -437,11 +448,12 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 @@ -454,14 +466,14 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -478,11 +490,12 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -495,14 +508,14 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -519,11 +532,12 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -536,14 +550,14 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -564,11 +578,12 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -581,14 +596,14 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -604,11 +619,12 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -621,14 +637,14 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -644,11 +660,12 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -661,14 +678,14 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -684,11 +701,12 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -701,14 +719,14 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -724,11 +742,12 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -741,14 +760,14 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -764,11 +783,12 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -781,14 +801,14 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -804,11 +824,12 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -821,14 +842,14 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -844,11 +865,12 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -861,14 +883,14 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -887,11 +909,12 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_arg_v8i1: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -900,10 +923,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: and z1.b, z1.b, #0x1 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 ; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i1 @@ -913,8 +936,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: str p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -933,23 +956,26 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i8 @get_i8() %vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0 @@ -960,23 +986,26 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i16 @get_i16() %vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0 @@ -987,23 +1016,26 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i32 @get_i32() %vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0 @@ -1014,23 +1046,26 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i64 @get_i64() %vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0 @@ -1041,27 +1076,30 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f16 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call half @get_f16() %vec = insertelement <vscale x 8 x half> poison, half %res, i32 0 @@ -1072,12 +1110,14 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f32 @@ -1087,11 +1127,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call float @get_f32() %vec = insertelement <vscale x 4 x float> poison, float %res, i32 0 @@ -1102,12 +1143,14 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_f64 @@ -1117,11 +1160,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call double @get_f64() %vec = insertelement <vscale x 2 x double> poison, double %res, i32 0 @@ -1136,12 +1180,14 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i8 @@ -1151,11 +1197,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x i8> @get_v1i8() %elt = extractelement <1 x i8> %res, i32 0 @@ -1167,12 +1214,14 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i16 @@ -1182,11 +1231,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x i16> @get_v1i16() %elt = extractelement <1 x i16> %res, i32 0 @@ -1198,12 +1248,14 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i32 @@ -1213,11 +1265,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x i32> @get_v1i32() %elt = extractelement <1 x i32> %res, i32 0 @@ -1229,12 +1282,14 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1i64 @@ -1244,11 +1299,12 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x i64> @get_v1i64() %elt = extractelement <1 x i64> %res, i32 0 @@ -1260,27 +1316,30 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f16 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x half> @get_v1f16() %elt = extractelement <1 x half> %res, i32 0 @@ -1292,12 +1351,14 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f32 @@ -1307,11 +1368,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x float> @get_v1f32() %elt = extractelement <1 x float> %res, i32 0 @@ -1323,12 +1385,14 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v1f64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v1f64 @@ -1338,11 +1402,12 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <1 x double> @get_v1f64() %elt = extractelement <1 x double> %res, i32 0 @@ -1358,27 +1423,30 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v16i8 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <16 x i8> @get_v16i8() %vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0) @@ -1389,27 +1457,30 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v8i16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <8 x i16> @get_v8i16() %vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0) @@ -1420,27 +1491,30 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v4i32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <4 x i32> @get_v4i32() %vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0) @@ -1451,27 +1525,30 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v2i64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <2 x i64> @get_v2i64() %vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0) @@ -1482,27 +1559,30 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v8f16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <8 x half> @get_v8f16() %vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0) @@ -1513,27 +1593,30 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v4f32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <4 x float> @get_v4f32() %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0) @@ -1544,27 +1627,30 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-LABEL: dont_coalesce_res_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_v2f64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <2 x double> @get_v2f64() %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll index d675733..6c8aff5 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; @@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: sm_body_sm_compatible_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x8, x0, #0x1 ; CHECK-NEXT: tbnz w8, #0, .LBB0_2 @@ -28,7 +32,7 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret ret float zeroinitializer } @@ -36,11 +40,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB1_2 @@ -54,11 +62,12 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @normal_callee() ret void @@ -68,12 +77,16 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 @@ -87,11 +100,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK-NEXT: // %bb.4: // %if.else ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_5: // %if.else -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_6: // %if.then ; CHECK-NEXT: smstop sm @@ -101,11 +115,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK-NEXT: // %bb.7: // %if.then ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_8: // %if.then -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %x, 0 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index cd6d45f5..3afd571 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; @@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: bl streaming_compatible_callee @@ -21,7 +25,7 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @streaming_compatible_callee(); @@ -47,26 +51,33 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_ define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_multiple_exit: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: cmp x0, #1 ; CHECK-NEXT: b.ne .LBB2_2 ; CHECK-NEXT: // %bb.1: // %if.else ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_2: // %if.end ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret entry: @@ -87,11 +98,16 @@ if.end: define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_no_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: index z0.d, #0, #1 @@ -102,12 +118,12 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %add = add <2 x i64> %a, <i64 41, i64 42>; @@ -120,11 +136,15 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee @@ -134,7 +154,7 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @locally_streaming_caller_streaming_callee(); @@ -151,12 +171,16 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -169,7 +193,7 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible" ret <2 x i64> %res; @@ -180,12 +204,16 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload @@ -198,7 +226,7 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible" @@ -212,11 +240,16 @@ declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(< define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: locally_streaming_caller_alloca: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill ; CHECK-NEXT: addsvl sp, sp, #-1 ; CHECK-NEXT: smstart sm ; CHECK-NEXT: mov x0, sp @@ -227,7 +260,7 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %alloca = alloca <vscale x 4 x i32> call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible" @@ -239,12 +272,16 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm @@ -259,7 +296,7 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret entry: %0 = call fast double @llvm.cos.f64(double %x) @@ -272,11 +309,16 @@ declare double @llvm.cos.f64(double) define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: test_arg_survives_loop: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB9_1: // %for.body @@ -289,12 +331,12 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat ; CHECK-NEXT: fadd s0, s1, s0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: br label %for.body @@ -314,11 +356,15 @@ for.cond.cleanup: define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: smstop sm @@ -326,7 +372,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret tail call void @streaming_compatible_callee(); ret void; diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 1e16f14..58992eb 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -mattr=+sme < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> SC (Normal -> Streaming-compatible) @@ -36,11 +36,13 @@ define void @normal_caller_streaming_compatible_callee() nounwind { define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_caller_normal_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB1_2 @@ -52,11 +54,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @normal_callee(); @@ -72,11 +75,13 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_caller_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 @@ -88,11 +93,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @streaming_callee(); @@ -124,11 +130,12 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-LABEL: streaming_compatible_with_neon_vectors: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: add x8, sp, #16 @@ -136,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -160,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -176,8 +183,9 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>) define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_with_scalable_vectors: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -255,8 +263,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 -; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg) %fadd = fadd <vscale x 2 x double> %res, %arg @@ -268,8 +276,9 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: streaming_compatible_with_predicate_vectors: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -347,8 +356,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 -; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg) %and = and <vscale x 2 x i1> %res, %arg @@ -360,11 +369,13 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>) define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: conditional_smstart_unreachable_block: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB7_2 @@ -372,6 +383,10 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: tbnz w19, #0, .LBB7_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB7_4: call void @streaming_callee() unreachable } @@ -381,11 +396,13 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co ; CHECK: // %bb.0: ; CHECK-NEXT: tbz w0, #0, .LBB8_6 ; CHECK-NEXT: // %bb.1: // %if.then -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbnz w19, #0, .LBB8_3 @@ -397,11 +414,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co ; CHECK-NEXT: // %bb.4: // %if.then ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB8_5: // %if.then -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: .LBB8_6: // %exit ; CHECK-NEXT: ret br i1 %p, label %if.then, label %exit @@ -417,11 +435,13 @@ exit: define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB9_2 @@ -433,11 +453,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB9_4: -; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret tail call void @normal_callee(); @@ -447,29 +468,32 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: call_to_non_streaming_pass_args: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x8, x1 ; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .cfi_offset vg, -24 ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm @@ -483,12 +507,25 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: // %bb.3: // %entry ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB10_4: // %entry -; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: .cfi_restore vg ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 ; CHECK-NEXT: ret entry: call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 465fb466..4321493 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> S (Normal -> Streaming) @@ -22,10 +22,11 @@ define void @normal_caller_streaming_callee() nounwind { ; CHECK-LABEL: normal_caller_streaming_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -47,10 +48,11 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable ; CHECK-LABEL: streaming_caller_normal_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl normal_callee ; CHECK-NEXT: smstart sm @@ -103,10 +105,11 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind { ; CHECK-LABEL: call_to_function_pointer_streaming_enabled: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop sm @@ -125,19 +128,20 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { ; CHECK-LABEL: smstart_clobber_simdfp: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 @@ -150,7 +154,9 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind { ; CHECK-LABEL: smstart_clobber_sve: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -216,7 +222,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind { ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @streaming_callee() ret <vscale x 4 x i32> %x; @@ -227,7 +233,9 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind { define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind { ; CHECK-LABEL: smstart_clobber_sve_duplicate: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill @@ -296,7 +304,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @streaming_callee() call void @streaming_callee() @@ -308,11 +316,12 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload @@ -320,11 +329,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret @@ -342,10 +351,11 @@ define void @disable_tailcallopt() nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm @@ -362,11 +372,13 @@ define void @disable_tailcallopt() nounwind { define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 { ; CHECK-LABEL: call_to_non_streaming_pass_sve_objects: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: rdsvl x3, #1 ; CHECK-NEXT: addvl x0, sp, #2 @@ -383,7 +395,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: %Data1 = alloca <vscale x 16 x i8>, align 16 @@ -400,11 +412,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-LABEL: call_to_non_streaming_pass_args: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index 45ca784..ac19bd5 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -15,11 +15,12 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-LABEL: test_no_stackslot_scavenging: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill @@ -31,8 +32,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: smstart sm ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -46,4 +47,4 @@ define void @test_no_stackslot_scavenging(float %f) #0 { declare void @use_f(float) -attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" } +attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll new file mode 100644 index 0000000..6264ce0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -0,0 +1,1177 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -enable-machine-outliner < %s | FileCheck %s --check-prefix=OUTLINER-CHECK + +declare void @callee(); +declare void @fixed_callee(<4 x i32>); +declare void @scalable_callee(<vscale x 2 x i64>); + +declare void @streaming_callee() #0; +declare void @streaming_callee_with_arg(i32) #0; + +; Simple example of a function with one call requiring a streaming mode change +; +define void @vg_unwind_simple() #0 { +; CHECK-LABEL: vg_unwind_simple: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: .cfi_offset vg, -8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_simple: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_unwind_simple: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @callee(); + ret void; +} + +; As above, with an extra register clobbered by the inline asm call which +; changes NeedsGapToAlignStack to false +; +define void @vg_unwind_needs_gap() #0 { +; CHECK-LABEL: vg_unwind_needs_gap: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w20 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_needs_gap: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x9, x20, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w20, -8 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: //APP +; FP-CHECK-NEXT: //NO_APP +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w20 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void asm sideeffect "", "~{x20}"() + call void @callee(); + ret void; +} + +define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 { +; CHECK-LABEL: vg_unwind_with_fixed_args: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl fixed_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_with_fixed_args: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: sub sp, sp, #112 +; FP-CHECK-NEXT: .cfi_def_cfa_offset 112 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #80 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; FP-CHECK-NEXT: bl fixed_callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 112 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: add sp, sp, #112 +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @fixed_callee(<4 x i32> %x); + ret void; +} + +define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 { +; CHECK-LABEL: vg_unwind_with_sve_args: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl scalable_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #18 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_with_sve_args: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 48 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; FP-CHECK-NEXT: mov x29, sp +; FP-CHECK-NEXT: .cfi_def_cfa w29, 48 +; FP-CHECK-NEXT: .cfi_offset w27, -8 +; FP-CHECK-NEXT: .cfi_offset w28, -16 +; FP-CHECK-NEXT: .cfi_offset w30, -40 +; FP-CHECK-NEXT: .cfi_offset w29, -48 +; FP-CHECK-NEXT: addvl sp, sp, #-18 +; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: ptrue pn8.b +; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG +; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; FP-CHECK-NEXT: addvl sp, sp, #-1 +; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: //APP +; FP-CHECK-NEXT: //NO_APP +; FP-CHECK-NEXT: .cfi_offset vg, -32 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: bl scalable_callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: addvl sp, sp, #1 +; FP-CHECK-NEXT: ptrue pn8.b +; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; FP-CHECK-NEXT: addvl sp, sp, #18 +; FP-CHECK-NEXT: .cfi_restore z8 +; FP-CHECK-NEXT: .cfi_restore z9 +; FP-CHECK-NEXT: .cfi_restore z10 +; FP-CHECK-NEXT: .cfi_restore z11 +; FP-CHECK-NEXT: .cfi_restore z12 +; FP-CHECK-NEXT: .cfi_restore z13 +; FP-CHECK-NEXT: .cfi_restore z14 +; FP-CHECK-NEXT: .cfi_restore z15 +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 48 +; FP-CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w27 +; FP-CHECK-NEXT: .cfi_restore w28 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void asm sideeffect "", "~{x28}"() + call void @scalable_callee(<vscale x 2 x i64> %x); + ret void; +} + +; This test was based on stack-probing-64k.ll and tries to test multiple uses of +; findScratchNonCalleeSaveRegister. +; +define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 { +; CHECK-LABEL: vg_unwind_multiple_scratch_regs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327776 +; CHECK-NEXT: .LBB4_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB4_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_multiple_scratch_regs: +; FP-CHECK: // %bb.0: // %entry +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w28, -8 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; FP-CHECK-NEXT: .LBB4_1: // %entry +; FP-CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; FP-CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; FP-CHECK-NEXT: cmp sp, x9 +; FP-CHECK-NEXT: str xzr, [sp] +; FP-CHECK-NEXT: b.ne .LBB4_1 +; FP-CHECK-NEXT: // %bb.2: // %entry +; FP-CHECK-NEXT: mov x8, sp +; FP-CHECK-NEXT: str x8, [x0] +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w28 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; +entry: + %v = alloca i8, i64 327680, align 1 + store ptr %v, ptr %out, align 8 + call void @callee() + ret void +} + +; Locally streaming functions require storing both the streaming and +; non-streaming values of VG. +; +define void @vg_locally_streaming_fn() #3 { +; CHECK-LABEL: vg_locally_streaming_fn: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_locally_streaming_fn: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: rdsvl x9, #1 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: lsr x9, x9, #3 +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset vg, -8 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: bl streaming_callee +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @callee() + call void @streaming_callee() + call void @callee() + ret void +} + +define void @streaming_compatible_to_streaming() #4 { +; CHECK-LABEL: streaming_compatible_to_streaming: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: tbnz w19, #0, .LBB6_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: tbnz w19, #0, .LBB6_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: streaming_compatible_to_streaming: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w19, -8 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: bl __arm_sme_state +; FP-CHECK-NEXT: and x19, x0, #0x1 +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2 +; FP-CHECK-NEXT: // %bb.1: +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .LBB6_2: +; FP-CHECK-NEXT: bl streaming_callee +; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_4 +; FP-CHECK-NEXT: // %bb.3: +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: .LBB6_4: +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w19 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @streaming_callee() + ret void +} + +define void @streaming_compatible_to_non_streaming() #4 { +; CHECK-LABEL: streaming_compatible_to_non_streaming: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: tbz w19, #0, .LBB7_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbz w19, #0, .LBB7_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: streaming_compatible_to_non_streaming: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w19, -8 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: bl __arm_sme_state +; FP-CHECK-NEXT: and x19, x0, #0x1 +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2 +; FP-CHECK-NEXT: // %bb.1: +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: .LBB7_2: +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: tbz w19, #0, .LBB7_4 +; FP-CHECK-NEXT: // %bb.3: +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .LBB7_4: +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w19 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @callee() + ret void +} + +; If the target does not have SVE, do not emit cntd in the prologue and +; instead spill the result returned by __arm_get_current_vg. +; This requires preserving the argument %x as the vg value is returned +; in X0. +; +define void @streaming_compatible_no_sve(i32 noundef %x) #4 { +; NO-SVE-CHECK-LABEL: streaming_compatible_no_sve: +; NO-SVE-CHECK: // %bb.0: +; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96 +; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: mov x9, x0 +; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg +; NO-SVE-CHECK-NEXT: stp x0, x19, [sp, #80] // 16-byte Folded Spill +; NO-SVE-CHECK-NEXT: mov x0, x9 +; NO-SVE-CHECK-NEXT: add x29, sp, #64 +; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32 +; NO-SVE-CHECK-NEXT: .cfi_offset w19, -8 +; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24 +; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32 +; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40 +; NO-SVE-CHECK-NEXT: .cfi_offset b9, -48 +; NO-SVE-CHECK-NEXT: .cfi_offset b10, -56 +; NO-SVE-CHECK-NEXT: .cfi_offset b11, -64 +; NO-SVE-CHECK-NEXT: .cfi_offset b12, -72 +; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80 +; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88 +; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96 +; NO-SVE-CHECK-NEXT: mov w8, w0 +; NO-SVE-CHECK-NEXT: bl __arm_sme_state +; NO-SVE-CHECK-NEXT: and x19, x0, #0x1 +; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16 +; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2 +; NO-SVE-CHECK-NEXT: // %bb.1: +; NO-SVE-CHECK-NEXT: smstart sm +; NO-SVE-CHECK-NEXT: .LBB8_2: +; NO-SVE-CHECK-NEXT: mov w0, w8 +; NO-SVE-CHECK-NEXT: bl streaming_callee_with_arg +; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4 +; NO-SVE-CHECK-NEXT: // %bb.3: +; NO-SVE-CHECK-NEXT: smstop sm +; NO-SVE-CHECK-NEXT: .LBB8_4: +; NO-SVE-CHECK-NEXT: .cfi_restore vg +; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 0 +; NO-SVE-CHECK-NEXT: .cfi_restore w19 +; NO-SVE-CHECK-NEXT: .cfi_restore w30 +; NO-SVE-CHECK-NEXT: .cfi_restore w29 +; NO-SVE-CHECK-NEXT: .cfi_restore b8 +; NO-SVE-CHECK-NEXT: .cfi_restore b9 +; NO-SVE-CHECK-NEXT: .cfi_restore b10 +; NO-SVE-CHECK-NEXT: .cfi_restore b11 +; NO-SVE-CHECK-NEXT: .cfi_restore b12 +; NO-SVE-CHECK-NEXT: .cfi_restore b13 +; NO-SVE-CHECK-NEXT: .cfi_restore b14 +; NO-SVE-CHECK-NEXT: .cfi_restore b15 +; NO-SVE-CHECK-NEXT: ret +; +; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @streaming_callee_with_arg(i32 %x) + ret void +} + +; Ensure we still emit async unwind information with -fno-asynchronous-unwind-tables +; if the function contains a streaming-mode change. + +define void @vg_unwind_noasync() #5 { +; CHECK-LABEL: vg_unwind_noasync: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: .cfi_offset vg, -8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; FP-CHECK-LABEL: vg_unwind_noasync: +; FP-CHECK: // %bb.0: +; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; FP-CHECK-NEXT: .cfi_def_cfa_offset 96 +; FP-CHECK-NEXT: cntd x9 +; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; FP-CHECK-NEXT: add x29, sp, #64 +; FP-CHECK-NEXT: .cfi_def_cfa w29, 32 +; FP-CHECK-NEXT: .cfi_offset w30, -24 +; FP-CHECK-NEXT: .cfi_offset w29, -32 +; FP-CHECK-NEXT: .cfi_offset b8, -40 +; FP-CHECK-NEXT: .cfi_offset b9, -48 +; FP-CHECK-NEXT: .cfi_offset b10, -56 +; FP-CHECK-NEXT: .cfi_offset b11, -64 +; FP-CHECK-NEXT: .cfi_offset b12, -72 +; FP-CHECK-NEXT: .cfi_offset b13, -80 +; FP-CHECK-NEXT: .cfi_offset b14, -88 +; FP-CHECK-NEXT: .cfi_offset b15, -96 +; FP-CHECK-NEXT: .cfi_offset vg, -16 +; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: bl callee +; FP-CHECK-NEXT: smstart sm +; FP-CHECK-NEXT: .cfi_restore vg +; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 +; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; FP-CHECK-NEXT: .cfi_def_cfa_offset 0 +; FP-CHECK-NEXT: .cfi_restore w30 +; FP-CHECK-NEXT: .cfi_restore w29 +; FP-CHECK-NEXT: .cfi_restore b8 +; FP-CHECK-NEXT: .cfi_restore b9 +; FP-CHECK-NEXT: .cfi_restore b10 +; FP-CHECK-NEXT: .cfi_restore b11 +; FP-CHECK-NEXT: .cfi_restore b12 +; FP-CHECK-NEXT: .cfi_restore b13 +; FP-CHECK-NEXT: .cfi_restore b14 +; FP-CHECK-NEXT: .cfi_restore b15 +; FP-CHECK-NEXT: ret +; OUTLINER-CHECK-LABEL: vg_unwind_noasync: +; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_ +; + call void @callee(); + ret void; +} + +attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) } +attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) } +attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) } +attributes #4 = { "aarch64_pstate_sm_compatible" uwtable(async) } +attributes #5 = { "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index c39894c..106d619 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS @dst = global [512 x i8] zeroinitializer, align 1 @src = global [512 x i8] zeroinitializer, align 1 @@ -22,13 +22,14 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { ; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 ; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 -; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm @@ -71,12 +72,13 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { ; CHECK-NO-SME-ROUTINES-LABEL: se_memset: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 ; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm ; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2 @@ -119,13 +121,14 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { ; CHECK-NO-SME-ROUTINES-LABEL: se_memmove: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 ; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 -; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm @@ -168,16 +171,18 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; ; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry -; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 ; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state ; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst -; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src ; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src ; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2 @@ -190,11 +195,12 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstart sm ; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry -; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ret ; ; CHECK-MOPS-LABEL: sc_memcpy: @@ -215,12 +221,16 @@ entry: define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: sb_memcpy: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: lsr x9, x9, #3 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: adrp x0, :got:dst ; CHECK-NEXT: adrp x1, :got:src @@ -232,17 +242,21 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry -; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1 ; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3 ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 +; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: smstart sm ; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src @@ -256,15 +270,20 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ret ; ; CHECK-MOPS-LABEL: sb_memcpy: ; CHECK-MOPS: // %bb.0: // %entry -; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: rdsvl x9, #1 +; CHECK-MOPS-NEXT: lsr x9, x9, #3 +; CHECK-MOPS-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill +; CHECK-MOPS-NEXT: cntd x9 +; CHECK-MOPS-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK-MOPS-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-MOPS-NEXT: smstart sm ; CHECK-MOPS-NEXT: adrp x8, :got:src ; CHECK-MOPS-NEXT: adrp x9, :got:dst @@ -274,10 +293,11 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { ; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0! ; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0! ; CHECK-MOPS-NEXT: smstop sm -; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: add sp, sp, #80 ; CHECK-MOPS-NEXT: ret entry: tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) |