diff options
Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 492 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64FrameLowering.h | 30 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 16 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 62 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrFormats.td | 22 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 15 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp | 20 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 83 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp | 444 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 15 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 2 |
12 files changed, 821 insertions, 383 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 79655e1..0f4bbfc3 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1610,7 +1610,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, int BaseOffset = -AFI->getTaggedBasePointerOffset(); Register FrameReg; StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( - MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, + MF, BaseOffset, false /*isFixed*/, TargetStackID::Default /*StackID*/, + FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); Register SrcReg = FrameReg; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ab5c6f3..8d6eb91 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -56,15 +56,20 @@ // | async context if needed | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) -// | <hazard padding> | -// |-----------------------------------| -// | | -// | callee-saved fp/simd/SVE regs | -// | | -// |-----------------------------------| -// | | -// | SVE stack objects | -// | | +// Default SVE stack layout Split SVE objects +// (aarch64-split-sve-objects=false) (aarch64-split-sve-objects=true) +// |-----------------------------------| |-----------------------------------| +// | <hazard padding> | | callee-saved PPR registers | +// |-----------------------------------| |-----------------------------------| +// | | | PPR stack objects | +// | callee-saved fp/simd/SVE regs | |-----------------------------------| +// | | | <hazard padding> | +// |-----------------------------------| |-----------------------------------| +// | | | callee-saved ZPR/FPR registers | +// | SVE stack objects | |-----------------------------------| +// | | | ZPR stack objects | +// |-----------------------------------| |-----------------------------------| +// ^ NB: FPR CSRs are promoted to ZPRs // |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at @@ -274,6 +279,11 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", cl::desc("sort stack allocations"), cl::init(true), cl::Hidden); +static cl::opt<bool> + SplitSVEObjects("aarch64-split-sve-objects", + cl::desc("Split allocation of ZPR & PPR objects"), + cl::init(false), cl::Hidden); + cl::opt<bool> EnableHomogeneousPrologEpilog( "homogeneous-prolog-epilog", cl::Hidden, cl::desc("Emit homogeneous prologue and epilogue for the size " @@ -324,7 +334,41 @@ AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(const AArch64FrameLowering &, MachineFunction &MF); -// Conservatively, returns true if the function is likely to have an SVE vectors +enum class AssignObjectOffsets { No, Yes }; +/// Process all the SVE stack objects and the SVE stack size and offsets for +/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE +/// stack sizes set). Returns the size of the SVE stack. +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets); + +static unsigned getStackHazardSize(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); +} + +/// Returns true if PPRs are spilled as ZPRs. +static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { + return MF.getSubtarget().getRegisterInfo()->getSpillSize( + AArch64::PPRRegClass) == 16; +} + +StackOffset +AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizeZPR()); +} + +StackOffset +AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const { + // With split SVE objects, the hazard padding is added to the PPR region, + // which places it between the [GPR, PPR] area and the [ZPR, FPR] area. This + // avoids hazards between both GPRs and FPRs and ZPRs and PPRs. + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::get(AFI->hasSplitSVEObjects() ? getStackHazardSize(MF) + : 0, + AFI->getStackSizePPR()); +} + +// Conservatively, returns true if the function is likely to have SVE vectors // on the stack. This function is safe to be called before callee-saves or // object offsets have been determined. static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, @@ -338,7 +382,7 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, const MachineFrameInfo &MFI = MF.getFrameInfo(); for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.hasScalableStackID(FI)) return true; } @@ -482,13 +526,6 @@ AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, } } -/// Returns the size of the entire SVE stackframe (calleesaves + spills). -StackOffset -AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const { - const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); -} - bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -514,7 +551,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { !Subtarget.hasSVE(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF) || LowerQRegCopyThroughMem); + AFI->hasSVEStackSize() || LowerQRegCopyThroughMem); } /// hasFPImpl - Return true if the specified function should have a dedicated @@ -557,7 +594,7 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { // CFA in either of these cases. if (AFI.needsDwarfUnwindInfo(MF) && ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) && - (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0))) + (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize()))) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. @@ -1126,10 +1163,6 @@ static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } -static unsigned getStackHazardSize(const MachineFunction &MF) { - return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); -} - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -1212,7 +1245,9 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); - StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // For VLA-area objects, just emit an offset at the end of the stack frame. // Whilst not quite correct, these objects do live at the end of the frame and @@ -1228,11 +1263,21 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); bool FPAfterSVECalleeSaves = isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.hasScalableStackID(FI)) { if (FPAfterSVECalleeSaves && - -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) + -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { + assert(!AFI->hasSplitSVEObjects() && + "split-sve-objects not supported with FPAfterSVECalleeSaves"); return StackOffset::getScalable(ObjectOffset); - return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), + } + StackOffset AccessOffset{}; + // The scalable vectors are below (lower address) the scalable predicates + // with split SVE objects, so we must subtract the size of the predicates. + if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + AccessOffset = -PPRStackSize; + return AccessOffset + + StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), ObjectOffset); } @@ -1294,14 +1339,15 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, - PreferFP, ForSimm); + auto StackID = static_cast<TargetStackID::Value>(MFI.getStackID(FI)); + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, StackID, + FrameReg, PreferFP, ForSimm); } StackOffset AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, - Register &FrameReg, bool PreferFP, bool ForSimm) const { + const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, + TargetStackID::Value StackID, Register &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); @@ -1312,8 +1358,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); + bool isSVE = MFI.isScalableStackID(StackID); - const StackOffset &SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -1388,12 +1437,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); if (isSVE) { - StackOffset FPOffset = - StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); + StackOffset FPOffset = StackOffset::get( + -AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); StackOffset SPOffset = SVEStackSize + StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), ObjectOffset); + + // With split SVE objects the ObjectOffset is relative to the split area + // (i.e. the PPR area or ZPR area respectively). + if (AFI->hasSplitSVEObjects() && StackID == TargetStackID::ScalableVector) { + // If we're accessing an SVE vector with split SVE objects... + // - From the FP we need to move down past the PPR area: + FPOffset -= PPRStackSize; + // - From the SP we only need to move up to the ZPR area: + SPOffset -= PPRStackSize; + // Note: `SPOffset = SVEStackSize + ...`, so `-= PPRStackSize` results in + // `SPOffset = ZPRStackSize + ...`. + } + if (FPAfterSVECalleeSaves) { FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { @@ -1401,6 +1463,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); } } + // Always use the FP for SVE spills if available and beneficial. if (hasFP(MF) && (SPOffset.getFixed() || FPOffset.getScalable() < SPOffset.getScalable() || @@ -1408,13 +1471,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : (unsigned)AArch64::SP; + return SPOffset; } - StackOffset ScalableOffset = {}; + StackOffset SVEAreaOffset = {}; if (FPAfterSVECalleeSaves) { // In this stack layout, the FP is in between the callee saves and other // SVE allocations. @@ -1422,25 +1485,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); if (UseFP) { if (isFixed) - ScalableOffset = SVECalleeSavedStack; + SVEAreaOffset = SVECalleeSavedStack; else if (!isCSR) - ScalableOffset = SVECalleeSavedStack - SVEStackSize; + SVEAreaOffset = SVECalleeSavedStack - SVEStackSize; } else { if (isFixed) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; else if (isCSR) - ScalableOffset = SVEStackSize - SVECalleeSavedStack; + SVEAreaOffset = SVEStackSize - SVECalleeSavedStack; } } else { if (UseFP && !(isFixed || isCSR)) - ScalableOffset = -SVEStackSize; + SVEAreaOffset = -SVEStackSize; if (!UseFP && (isFixed || isCSR)) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; } if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return StackOffset::getFixed(FPOffset) + ScalableOffset; + return StackOffset::getFixed(FPOffset) + SVEAreaOffset; } // Use the base pointer if we have one. @@ -1457,7 +1520,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return StackOffset::getFixed(Offset) + ScalableOffset; + return StackOffset::getFixed(Offset) + SVEAreaOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1614,11 +1677,25 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, RegInc = -1; FirstReg = Count - 1; } + bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); - int ScalableByteOffset = - FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize(); + + int ZPRByteOffset = 0; + int PPRByteOffset = 0; + bool SplitPPRs = AFI->hasSplitSVEObjects(); + if (SplitPPRs) { + ZPRByteOffset = AFI->getZPRCalleeSavedStackSize(); + PPRByteOffset = AFI->getPPRCalleeSavedStackSize(); + } else if (!FPAfterSVECalleeSaves) { + ZPRByteOffset = + AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize(); + // Unused: Everything goes in ZPR space. + PPRByteOffset = 0; + } + bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); Register LastReg = 0; + bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs; // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -1647,8 +1724,12 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, llvm_unreachable("Unsupported register class."); } + int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs + ? PPRByteOffset + : ZPRByteOffset; + // Add the stack hazard size as we transition from GPR->FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (HasCSHazardPadding && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) ByteOffset += StackFillDir * StackHazardSize; @@ -1656,7 +1737,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { + if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -2021,10 +2102,14 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } // Update the StackIDs of the SVE stack slots. MachineFrameInfo &MFI = MF.getFrameInfo(); - if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) { + if (RPI.Type == RegPairInfo::ZPR) { MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector); if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); + } else if (RPI.Type == RegPairInfo::PPR) { + MFI.setStackID(FrameIdxReg1, TargetStackID::ScalablePredicateVector); + if (RPI.isPaired()) + MFI.setStackID(FrameIdxReg2, TargetStackID::ScalablePredicateVector); } } return true; @@ -2199,6 +2284,13 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI, return getMMOFrameID(*MI.memoperands_begin(), MFI); } +// Returns true if the LDST MachineInstr \p MI is a PPR access. +static bool isPPRAccess(const MachineInstr &MI) { + return MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && + MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO && + AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. @@ -2222,26 +2314,50 @@ void AArch64FrameLowering::determineStackHazardSlot( bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { return AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg) || - AArch64::PPRRegClass.contains(Reg); + AArch64::ZPRRegClass.contains(Reg); + }); + bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { + return AArch64::PPRRegClass.contains(Reg); }); bool HasFPRStackObjects = false; - if (!HasFPRCSRs) { - std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd()); + bool HasPPRStackObjects = false; + if (!HasFPRCSRs || SplitSVEObjects) { + enum SlotType : uint8_t { + Unknown = 0, + ZPRorFPR = 1 << 0, + PPR = 1 << 1, + GPR = 1 << 2, + LLVM_MARK_AS_BITMASK_ENUM(GPR) + }; + + // Find stack slots solely used for one kind of register (ZPR, PPR, etc.), + // based on the kinds of accesses used in the function. + SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown); for (auto &MBB : MF) { for (auto &MI : MBB) { std::optional<int> FI = getLdStFrameID(MI, MFI); - if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || - AArch64InstrInfo::isFpOrNEON(MI)) - FrameObjects[*FI] |= 2; - else - FrameObjects[*FI] |= 1; + if (!FI || FI < 0 || FI > int(SlotTypes.size())) + continue; + if (MFI.hasScalableStackID(*FI)) { + SlotTypes[*FI] |= + isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR; + } else { + SlotTypes[*FI] |= AArch64InstrInfo::isFpOrNEON(MI) + ? SlotType::ZPRorFPR + : SlotType::GPR; } } } - HasFPRStackObjects = - any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + + for (int FI = 0; FI < int(SlotTypes.size()); ++FI) { + HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR; + // For SplitSVEObjects remember that this stack slot is a predicate, this + // will be needed later when determining the frame layout. + if (SlotTypes[FI] == SlotType::PPR) { + MFI.setStackID(FI, TargetStackID::ScalablePredicateVector); + HasPPRStackObjects = true; + } + } } if (HasFPRCSRs || HasFPRStackObjects) { @@ -2250,6 +2366,78 @@ void AArch64FrameLowering::determineStackHazardSlot( << StackHazardSize << "\n"); AFI->setStackHazardSlotIndex(ID); } + + // Determine if we should use SplitSVEObjects. This should only be used if + // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs. + if (SplitSVEObjects) { + if (!HasPPRCSRs && !HasPPRStackObjects) { + LLVM_DEBUG( + dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); + return; + } + + if (!HasFPRCSRs && !HasFPRStackObjects) { + LLVM_DEBUG( + dbgs() + << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); + return; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " + "sized objects or realignment\n"); + return; + } + + if (arePPRsSpilledAsZPR(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with " + "-aarch64-enable-zpr-predicate-spills"); + return; + } + + // If another calling convention is explicitly set FPRs can't be promoted to + // ZPR callee-saves. + if (!is_contained({CallingConv::C, CallingConv::Fast, + CallingConv::AArch64_SVE_VectorCall}, + MF.getFunction().getCallingConv())) { + LLVM_DEBUG( + dbgs() << "Calling convention is not supported with SplitSVEObjects"); + return; + } + + [[maybe_unused]] const AArch64Subtarget &Subtarget = + MF.getSubtarget<AArch64Subtarget>(); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Expected SVE to be available for PPRs"); + + // With SplitSVEObjects the CS hazard padding is placed between the + // PPRs and ZPRs. If there are any FPR CS there would be a hazard between + // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. + BitVector FPRZRegs(SavedRegs.size()); + for (size_t Reg = 0, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) { + BitVector::reference RegBit = SavedRegs[Reg]; + if (!RegBit) + continue; + unsigned SubRegIdx = 0; + if (AArch64::FPR64RegClass.contains(Reg)) + SubRegIdx = AArch64::dsub; + else if (AArch64::FPR128RegClass.contains(Reg)) + SubRegIdx = AArch64::zsub; + else + continue; + // Clear the bit for the FPR save. + RegBit = false; + // Mark that we should save the corresponding ZPR. + Register ZReg = + TRI->getMatchingSuperReg(Reg, SubRegIdx, &AArch64::ZPRRegClass); + FPRZRegs.set(ZReg); + } + SavedRegs |= FPRZRegs; + + AFI->setSplitSVEObjects(true); + LLVM_DEBUG(dbgs() << "SplitSVEObjects enabled!\n"); + } } void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, @@ -2260,10 +2448,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2382,17 +2571,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::X18); } + // Determine if a Hazard slot should be used and where it should go. + // If SplitSVEObjects is used, the hazard padding is placed between the PPRs + // and ZPRs. Otherwise, it goes in the callee save area. + determineStackHazardSlot(MF, SavedRegs); + // Calculates the callee saved stack size. unsigned CSStackSize = 0; - unsigned SVECSStackSize = 0; + unsigned ZPRCSStackSize = 0; + unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { auto *RC = TRI->getMinimalPhysRegClass(Reg); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); - if (AArch64::PPRRegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += SpillSize; + bool IsZPR = AArch64::ZPRRegClass.contains(Reg); + bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); + if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF))) + ZPRCSStackSize += SpillSize; + else if (IsPPR) + PPRCSStackSize += SpillSize; else CSStackSize += SpillSize; } @@ -2402,17 +2600,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // only 64-bit GPRs can be added to SavedRegs. unsigned NumSavedRegs = SavedRegs.count(); + // If we have hazard padding in the CS area add that to the size. + if (AFI->isStackHazardIncludedInCalleeSaveArea()) + CSStackSize += getStackHazardSize(MF); + // Increase the callee-saved stack size if the function has streaming mode // changes, as we will need to spill the value of the VG register. if (requiresSaveVG(MF)) CSStackSize += 8; - // Determine if a Hazard slot should be used, and increase the CSStackSize by - // StackHazardSize if so. - determineStackHazardSlot(MF, SavedRegs); - if (AFI->hasStackHazardSlotIndex()) - CSStackSize += getStackHazardSize(MF); - // If we must call __arm_get_current_vg in the prologue preserve the LR. if (requiresSaveVG(MF) && !Subtarget.hasSVE()) SavedRegs.set(AArch64::LR); @@ -2433,8 +2629,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, }); // If any callee-saved registers are used, the frame cannot be eliminated. - int64_t SVEStackSize = - alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); + auto [ZPRLocalStackSize, PPRLocalStackSize] = + determineSVEStackSizes(MF, AssignObjectOffsets::No); + uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize; + uint64_t SVEStackSize = + alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16); bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize @@ -2519,7 +2718,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); - AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); + AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16)); } bool AArch64FrameLowering::assignCalleeSavedSpillSlots( @@ -2572,7 +2771,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // Create a hazard slot as we switch between GPR and FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->isStackHazardIncludedInCalleeSaveArea() && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(Reg)) { assert(HazardSlotIndex == std::numeric_limits<int>::max() && @@ -2611,7 +2810,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( } // Add hazard slot in the case where no FPR CSRs are present. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->isStackHazardIncludedInCalleeSaveArea() && HazardSlotIndex == std::numeric_limits<int>::max()) { HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex @@ -2658,7 +2857,6 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, assert((Max == std::numeric_limits<int>::min() || Max + 1 == CS.getFrameIdx()) && "SVE CalleeSaves are not consecutive"); - Min = std::min(Min, CS.getFrameIdx()); Max = std::max(Max, CS.getFrameIdx()); } @@ -2666,43 +2864,64 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, return Min != std::numeric_limits<int>::max(); } -// Process all the SVE stack objects and determine offsets for each -// object. If AssignOffsets is true, the offsets get assigned. -// Fills in the first and last callee-saved frame indices into -// Min/MaxCSFrameIndex, respectively. -// Returns the size of the stack. -static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, - int &MinCSFrameIndex, - int &MaxCSFrameIndex, - bool AssignOffsets) { +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + + SVEStackSizes SVEStack{}; + + // With SplitSVEObjects we maintain separate stack offsets for predicates + // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates + // are included in the SVE vector area. + uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; + uint64_t &PPRStackTop = + AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; + #ifndef NDEBUG // First process all fixed stack objects. for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - assert(MFI.getStackID(I) != TargetStackID::ScalableVector && + assert(!MFI.hasScalableStackID(I) && "SVE vectors should never be passed on the stack by value, only by " "reference."); #endif - auto Assign = [&MFI](int FI, int64_t Offset) { + auto AllocateObject = [&](int FI) { + uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector + ? ZPRStackTop + : PPRStackTop; + + // FIXME: Given that the length of SVE vectors is not necessarily a power of + // two, we'd need to align every object dynamically at runtime if the + // alignment is larger than 16. This is not yet supported. + Align Alignment = MFI.getObjectAlign(FI); + if (Alignment > Align(16)) + report_fatal_error( + "Alignment of scalable vectors > 16 bytes is not yet supported"); + + StackTop += MFI.getObjectSize(FI); + StackTop = alignTo(StackTop, Alignment); + + assert(StackTop < std::numeric_limits<int64_t>::max() && + "SVE StackTop far too large?!"); + + int64_t Offset = -int64_t(StackTop); + if (AssignOffsets == AssignObjectOffsets::Yes) + MFI.setObjectOffset(FI, Offset); + LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); - MFI.setObjectOffset(FI, Offset); }; - int64_t Offset = 0; - // Then process all callee saved slots. + int MinCSFrameIndex, MaxCSFrameIndex; if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Assign offsets to the callee save slots. - for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { - Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlign(I)); - if (AssignOffsets) - Assign(I, -Offset); - } + for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI) + AllocateObject(FI); } - // Ensure that the Callee-save area is aligned to 16bytes. - Offset = alignTo(Offset, Align(16U)); + // Ensure the CS area is 16-byte aligned. + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; @@ -2715,48 +2934,31 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) ObjectsToAllocate.push_back(StackProtectorFI); } - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - unsigned StackID = MFI.getStackID(I); - if (StackID != TargetStackID::ScalableVector) - continue; - if (I == StackProtectorFI) + + for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) { + if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI)) continue; - if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) + if (MaxCSFrameIndex >= FI && FI >= MinCSFrameIndex) continue; - if (MFI.isDeadObjectIndex(I)) + + if (MFI.getStackID(FI) != TargetStackID::ScalableVector && + MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector) continue; - ObjectsToAllocate.push_back(I); + ObjectsToAllocate.push_back(FI); } // Allocate all SVE locals and spills - for (unsigned FI : ObjectsToAllocate) { - Align Alignment = MFI.getObjectAlign(FI); - // FIXME: Given that the length of SVE vectors is not necessarily a power of - // two, we'd need to align every object dynamically at runtime if the - // alignment is larger than 16. This is not yet supported. - if (Alignment > Align(16)) - report_fatal_error( - "Alignment of scalable vectors > 16 bytes is not yet supported"); - - Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); - if (AssignOffsets) - Assign(FI, -Offset); - } + for (unsigned FI : ObjectsToAllocate) + AllocateObject(FI); - return Offset; -} + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); -int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets( - MachineFrameInfo &MFI) const { - int MinCSFrameIndex, MaxCSFrameIndex; - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false); -} + if (AssignOffsets == AssignObjectOffsets::Yes) + AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize); -int64_t AArch64FrameLowering::assignSVEStackObjectOffsets( - MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const { - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, - true); + return SVEStack; } /// Attempts to scavenge a register from \p ScavengeableRegs given the used @@ -3070,12 +3272,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); - int MinCSFrameIndex, MaxCSFrameIndex; - int64_t SVEStackSize = - assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex); - - AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U)); - AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex); + (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. @@ -3359,7 +3556,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, Register Reg; FrameRegOffset = TFI->resolveFrameOffsetReference( - *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, + *MF, FirstTagStore.Offset, false /*isFixed*/, + TargetStackID::Default /*StackID*/, Reg, /*PreferFP=*/false, /*ForSimm=*/true); FrameReg = Reg; FrameRegUpdate = std::nullopt; @@ -3597,7 +3795,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( // Go to common code if we cannot provide sp + offset. if (MFI.hasVarSizedObjects() || - MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() || + MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() || MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF)) return getFrameIndexReference(MF, FI, FrameReg); @@ -3699,10 +3897,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { void AArch64FrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { - if (!OrderFrameObjects || ObjectsToAllocate.empty()) + const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + + if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) || + ObjectsToAllocate.empty()) return; - const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); for (auto &Obj : ObjectsToAllocate) { @@ -4080,7 +4280,7 @@ void AArch64FrameLowering::emitRemarks( } unsigned RegTy = StackAccess::AccessType::GPR; - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (MFI.hasScalableStackID(FrameIdx)) { // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO // spill/fill the predicate as a data vector (so are an FPR access). if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 7bba053..32a9bd8 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,6 +24,11 @@ class AArch64FunctionInfo; class AArch64PrologueEmitter; class AArch64EpilogueEmitter; +struct SVEStackSizes { + uint64_t ZPRStackSize{0}; + uint64_t PPRStackSize{0}; +}; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -64,8 +69,9 @@ public: bool ForSimm) const; StackOffset resolveFrameOffsetReference(const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, - bool isSVE, Register &FrameReg, - bool PreferFP, bool ForSimm) const; + TargetStackID::Value StackID, + Register &FrameReg, bool PreferFP, + bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, @@ -124,6 +130,7 @@ public: return false; case TargetStackID::Default: case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::NoAlloc: return true; } @@ -132,7 +139,8 @@ public: bool isStackIdSafeForLocalArea(unsigned StackId) const override { // We don't support putting SVE objects into the pre-allocated local // frame block at the moment. - return StackId != TargetStackID::ScalableVector; + return (StackId != TargetStackID::ScalableVector && + StackId != TargetStackID::ScalablePredicateVector); } void @@ -145,7 +153,17 @@ public: bool requiresSaveVG(const MachineFunction &MF) const; - StackOffset getSVEStackSize(const MachineFunction &MF) const; + /// Returns the size of the entire ZPR stackframe (calleesaves + spills). + StackOffset getZPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire PPR stackframe (calleesaves + spills + + /// hazard padding). + StackOffset getPPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire SVE stackframe (PPRs + ZPRs). + StackOffset getSVEStackSize(const MachineFunction &MF) const { + return getZPRStackSize(MF) + getPPRStackSize(MF); + } friend class AArch64PrologueEpilogueCommon; friend class AArch64PrologueEmitter; @@ -165,10 +183,6 @@ private: /// Returns true if CSRs should be paired. bool producePairRegisters(MachineFunction &MF) const; - int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const; - int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, - int &MinCSFrameIndex, - int &MaxCSFrameIndex) const; /// Make a determination whether a Hazard slot is used and create it if /// needed. void determineStackHazardSlot(MachineFunction &MF, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6a1b06e..e7b2d20 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain}; SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -2110,14 +2111,15 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc) { - SDValue ZtValue; - SmallVector<SDValue, 4> Ops; if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - Ops.push_back(ZtValue); - Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, + createZMulTuple({Node->getOperand(3), Node->getOperand(4)}), + Chain}; + SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -7495,7 +7497,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.hasScalableStackID(FI)) { Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); return true; @@ -7541,7 +7543,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(Base)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.hasScalableStackID(FI)) Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 45f5235..70d5ad7d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); @@ -6617,7 +6618,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } - case Intrinsic::aarch64_neon_vsri: case Intrinsic::aarch64_neon_vsli: case Intrinsic::aarch64_sve_sri: @@ -9256,8 +9256,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, (MI.getOpcode() == AArch64::ADDXri || MI.getOpcode() == AArch64::SUBXri)) { const MachineOperand &MO = MI.getOperand(1); - if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) == - TargetStackID::ScalableVector) + if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex())) MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, /*IsImplicit=*/true)); } @@ -9704,8 +9703,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); - if (isScalable) - MFI.setStackID(FI, TargetStackID::ScalableVector); + if (isScalable) { + bool IsPred = VA.getValVT() == MVT::aarch64svcount || + VA.getValVT().getVectorElementType() == MVT::i1; + MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector + : TargetStackID::ScalableVector); + } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); SDValue Ptr = DAG.getFrameIndex( @@ -15154,9 +15157,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { : Shift.getOperand(1); unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; - SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm); - - return ResultSLI; + return DAG.getNode(Inst, DL, VT, X, Y, Imm); } static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) { @@ -27234,6 +27235,21 @@ static bool isLanes1toNKnownZero(SDValue Op) { } } +// Return true if the vector operation can guarantee that the first lane of its +// result is active. +static bool isLane0KnownActive(SDValue Op) { + switch (Op.getOpcode()) { + default: + return false; + case AArch64ISD::REINTERPRET_CAST: + return isLane0KnownActive(Op->getOperand(0)); + case ISD::SPLAT_VECTOR: + return isOneConstant(Op.getOperand(0)); + case AArch64ISD::PTRUE: + return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all; + }; +} + static SDValue removeRedundantInsertVectorElt(SDNode *N) { assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); SDValue InsertVec = N->getOperand(0); @@ -27519,6 +27535,32 @@ static SDValue performMULLCombine(SDNode *N, return SDValue(); } +static SDValue performPTestFirstCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SDLoc DL(N); + auto Mask = N->getOperand(0); + auto Pred = N->getOperand(1); + + if (!isLane0KnownActive(Mask)) + return SDValue(); + + if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST) + Pred = Pred->getOperand(0); + + if (Pred->getOpcode() == ISD::CONCAT_VECTORS) { + Pred = Pred->getOperand(0); + Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred); + return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask, + Pred); + } + + return SDValue(); +} + static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -27875,6 +27917,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::UMULL: case AArch64ISD::PMULL: return performMULLCombine(N, DCI, DAG); + case AArch64ISD::PTEST_FIRST: + return performPTestFirstCombine(N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (N->getConstantOperandVal(1)) { @@ -29564,7 +29608,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { // than doing it here in finalizeLowering. if (MFI.hasStackProtectorIndex()) { for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { - if (MFI.getStackID(i) == TargetStackID::ScalableVector && + if (MFI.hasScalableStackID(i) && MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { MFI.setStackID(MFI.getStackProtectorIndex(), TargetStackID::ScalableVector); diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f07d351..6ef0a95 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10176,28 +10176,6 @@ multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm, (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } -multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> { - def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, - FPR8, FPR8, vecshiftR8, asm, []> { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, - FPR16, FPR16, vecshiftR16, asm, []> { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, - FPR32, FPR32, vecshiftR32, asm, []> { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, - FPR64, FPR64, vecshiftR64, asm, []> { - let Inst{21-16} = imm{5-0}; - } -} - //---------------------------------------------------------------------------- // AdvSIMD vector x indexed element //---------------------------------------------------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5a51c81..5a90da1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1503,6 +1503,13 @@ AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, getElementSizeForOpcode(PredOpcode)) return PredOpcode; + // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since + // WHILEcc performs an implicit PTEST with an all active mask, setting + // the N flag as the PTEST_FIRST would. + if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST && + isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31) + return PredOpcode; + return {}; } @@ -5592,7 +5599,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5607,7 +5614,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::STRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::STR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: @@ -5777,7 +5784,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( if (IsPNR) PNRReg = DestReg; Opc = AArch64::LDR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5792,7 +5799,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( Opc = AArch64::LDRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::LDR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index a81f5b3..b3c9656 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -23,12 +23,21 @@ using namespace llvm; +static std::optional<uint64_t> +getSVEStackSize(const AArch64FunctionInfo &MFI, + uint64_t (AArch64FunctionInfo::*GetStackSize)() const) { + if (!MFI.hasCalculatedStackSizeSVE()) + return std::nullopt; + return (MFI.*GetStackSize)(); +} + yaml::AArch64FunctionInfo::AArch64FunctionInfo( const llvm::AArch64FunctionInfo &MFI) : HasRedZone(MFI.hasRedZone()), - StackSizeSVE(MFI.hasCalculatedStackSizeSVE() - ? std::optional<uint64_t>(MFI.getStackSizeSVE()) - : std::nullopt), + StackSizeZPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizeZPR)), + StackSizePPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)), HasStackFrame(MFI.hasStackFrame() ? std::optional<bool>(MFI.hasStackFrame()) : std::nullopt) {} @@ -41,8 +50,9 @@ void AArch64FunctionInfo::initializeBaseYamlFields( const yaml::AArch64FunctionInfo &YamlMFI) { if (YamlMFI.HasRedZone) HasRedZone = YamlMFI.HasRedZone; - if (YamlMFI.StackSizeSVE) - setStackSizeSVE(*YamlMFI.StackSizeSVE); + if (YamlMFI.StackSizeZPR || YamlMFI.StackSizePPR) + setStackSizeSVE(YamlMFI.StackSizeZPR.value_or(0), + YamlMFI.StackSizePPR.value_or(0)); if (YamlMFI.HasStackFrame) setHasStackFrame(*YamlMFI.HasStackFrame); } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 897c7e8..91e64e6 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -74,13 +74,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// Amount of stack frame size, not including callee-saved registers. uint64_t LocalStackSize = 0; - /// The start and end frame indices for the SVE callee saves. - int MinSVECSFrameIndex = 0; - int MaxSVECSFrameIndex = 0; - /// Amount of stack frame size used for saving callee-saved registers. unsigned CalleeSavedStackSize = 0; - unsigned SVECalleeSavedStackSize = 0; + unsigned ZPRCalleeSavedStackSize = 0; + unsigned PPRCalleeSavedStackSize = 0; bool HasCalleeSavedStackSize = false; bool HasSVECalleeSavedStackSize = false; @@ -137,9 +134,14 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SVE stack size (for predicates and data vectors) are maintained here /// rather than in FrameInfo, as the placement and Stack IDs are target /// specific. - uint64_t StackSizeSVE = 0; + uint64_t StackSizeZPR = 0; + uint64_t StackSizePPR = 0; + + /// Are SVE objects (vectors and predicates) split into separate regions on + /// the stack. + bool SplitSVEObjects = false; - /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid. bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a @@ -312,16 +314,25 @@ public: TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } - - void setStackSizeSVE(uint64_t S) { + void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + StackSizeZPR = ZPR; + StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; - StackSizeSVE = S; } - uint64_t getStackSizeSVE() const { + uint64_t getStackSizeZPR() const { + assert(hasCalculatedStackSizeSVE()); + return StackSizeZPR; + } + uint64_t getStackSizePPR() const { assert(hasCalculatedStackSizeSVE()); - return StackSizeSVE; + return StackSizePPR; + } + + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + bool hasSVEStackSize() const { + return getStackSizeZPR() > 0 || getStackSizePPR() > 0; } bool hasStackFrame() const { return HasStackFrame; } @@ -329,7 +340,6 @@ public: bool isStackRealigned() const { return StackRealigned; } void setStackRealigned(bool s) { StackRealigned = s; } - bool hasCalleeSaveStackFreeSpace() const { return CalleeSaveStackHasFreeSpace; } @@ -414,29 +424,37 @@ public: } // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' - void setSVECalleeSavedStackSize(unsigned Size) { - SVECalleeSavedStackSize = Size; + void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + ZPRCalleeSavedStackSize = ZPR; + PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; } - unsigned getSVECalleeSavedStackSize() const { + unsigned getZPRCalleeSavedStackSize() const { assert(HasSVECalleeSavedStackSize && - "SVECalleeSavedStackSize has not been calculated"); - return SVECalleeSavedStackSize; + "ZPRCalleeSavedStackSize has not been calculated"); + return ZPRCalleeSavedStackSize; } - - void setMinMaxSVECSFrameIndex(int Min, int Max) { - MinSVECSFrameIndex = Min; - MaxSVECSFrameIndex = Max; + unsigned getPPRCalleeSavedStackSize() const { + assert(HasSVECalleeSavedStackSize && + "PPRCalleeSavedStackSize has not been calculated"); + return PPRCalleeSavedStackSize; } - int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; } - int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; } + unsigned getSVECalleeSavedStackSize() const { + assert(!hasSplitSVEObjects() && + "ZPRs and PPRs are split. Use get[ZPR|PPR]CalleeSavedStackSize()"); + return getZPRCalleeSavedStackSize() + getPPRCalleeSavedStackSize(); + } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamicTLSAccesses; } + bool isStackHazardIncludedInCalleeSaveArea() const { + return hasStackHazardSlotIndex() && !hasSplitSVEObjects(); + } + std::optional<bool> hasRedZone() const { return HasRedZone; } void setHasRedZone(bool s) { HasRedZone = s; } @@ -472,6 +490,15 @@ public: StackHazardCSRSlotIndex = Index; } + bool hasSplitSVEObjects() const { return SplitSVEObjects; } + void setSplitSVEObjects(bool s) { SplitSVEObjects = s; } + + bool hasSVE_AAPCS(const MachineFunction &MF) const { + return hasSplitSVEObjects() || isSVECC() || + MF.getFunction().getCallingConv() == + CallingConv::AArch64_SVE_VectorCall; + } + SMEAttrs getSMEFnAttrs() const { return SMEFnAttrs; } unsigned getSRetReturnReg() const { return SRetReturnReg; } @@ -611,7 +638,8 @@ private: namespace yaml { struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { std::optional<bool> HasRedZone; - std::optional<uint64_t> StackSizeSVE; + std::optional<uint64_t> StackSizeZPR; + std::optional<uint64_t> StackSizePPR; std::optional<bool> HasStackFrame; AArch64FunctionInfo() = default; @@ -624,7 +652,8 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { template <> struct MappingTraits<AArch64FunctionInfo> { static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { YamlIO.mapOptional("hasRedZone", MFI.HasRedZone); - YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE); + YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR); + YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR); YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame); } }; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 09b3643..aed137c 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -48,21 +48,19 @@ bool AArch64PrologueEpilogueCommon::isVGInstruction( return Opc == TargetOpcode::COPY; } -// Convenience function to determine whether I is an SVE callee save. -static bool isSVECalleeSave(MachineBasicBlock::iterator I) { +// Convenience function to determine whether I is part of the ZPR callee saves. +static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { switch (I->getOpcode()) { default: return false; - case AArch64::PTRUE_C_B: case AArch64::LD1B_2Z_IMM: case AArch64::ST1B_2Z_IMM: case AArch64::STR_ZXI: - case AArch64::STR_PXI: case AArch64::LDR_ZXI: - case AArch64::LDR_PXI: - case AArch64::PTRUE_B: case AArch64::CPY_ZPzI_B: case AArch64::CMPNE_PPzZI_B: + case AArch64::PTRUE_C_B: + case AArch64::PTRUE_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); case AArch64::SEH_SavePReg: @@ -71,6 +69,23 @@ static bool isSVECalleeSave(MachineBasicBlock::iterator I) { } } +// Convenience function to determine whether I is part of the PPR callee saves. +static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { + switch (I->getOpcode()) { + default: + return false; + case AArch64::STR_PXI: + case AArch64::LDR_PXI: + return I->getFlag(MachineInstr::FrameSetup) || + I->getFlag(MachineInstr::FrameDestroy); + } +} + +// Convenience function to determine whether I is part of the SVE callee saves. +static bool isPartOfSVECalleeSaves(MachineBasicBlock::iterator I) { + return isPartOfZPRCalleeSaves(I) || isPartOfPPRCalleeSaves(I); +} + AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -316,7 +331,7 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( // When there is an SVE area on the stack, always allocate the // callee-saves and spills/locals separately. - if (AFL.getSVEStackSize(MF)) + if (AFI->hasSVEStackSize()) return false; return true; @@ -639,7 +654,7 @@ void AArch64PrologueEmitter::emitPrologue() { // Now allocate space for the GPR callee saves. MachineBasicBlock::iterator MBBI = PrologueBeginI; - while (MBBI != EndI && isSVECalleeSave(MBBI)) + while (MBBI != EndI && isPartOfSVECalleeSaves(MBBI)) ++MBBI; FirstGPRSaveI = convertCalleeSaveRestoreToSPPrePostIncDec( MBBI, DL, -AFI->getCalleeSavedStackSize(), EmitAsyncCFI); @@ -669,7 +684,7 @@ void AArch64PrologueEmitter::emitPrologue() { MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI; while (AfterGPRSavesI != EndI && AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) && - !isSVECalleeSave(AfterGPRSavesI)) { + !isPartOfSVECalleeSaves(AfterGPRSavesI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. (!AFL.requiresSaveVG(MF) || !isVGInstruction(AfterGPRSavesI, TLI))) @@ -700,56 +715,105 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset SVEStackSize = AFL.getSVEStackSize(MF); - StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; - MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI; + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + + std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin, + ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; StackOffset CFAOffset = StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); - - // Process the SVE callee-saves to determine what space needs to be - // allocated. MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize - << "\n"); - SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); - SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - // Find callee save instructions in frame. - // Note: With FPAfterSVECalleeSaves the callee saves have already been - // allocated. - if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI; - assert(isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); - while (isSVECalleeSave(AfterSVESavesI) && + if (!FPAfterSVECalleeSaves) { + // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR + // areas. + PPRCalleeSavesBegin = AfterGPRSavesI; + if (PPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " + << PPRCalleeSavesSize.getScalable() << "\n"); + + assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfPPRCalleeSaves(AfterSVESavesI) && + AfterSVESavesI != MBB.getFirstTerminator()) + ++AfterSVESavesI; + } + PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI; + if (ZPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " + << ZPRCalleeSavesSize.getScalable() << "\n"); + assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfZPRCalleeSaves(AfterSVESavesI) && AfterSVESavesI != MBB.getFirstTerminator()) ++AfterSVESavesI; - CalleeSavesEnd = AfterSVESavesI; - - StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); - // Allocate space for the callee saves (if any). - allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); } + ZPRCalleeSavesEnd = AfterSVESavesI; } - CFAOffset += SVECalleeSavesSize; if (EmitAsyncCFI) - emitCalleeSavedSVELocations(CalleeSavesEnd); - - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. - assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!AFL.canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - allocateStackSpace(CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), + emitCalleeSavedSVELocations(AfterSVESavesI); + + if (AFI->hasSplitSVEObjects()) { + assert(!FPAfterSVECalleeSaves && + "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); + assert(!AFL.canUseRedZone(MF) && + "Cannot use redzone with aarch64-split-sve-objects"); + // TODO: Handle HasWinCFI/NeedsWinCFI? + assert(!NeedsWinCFI && + "WinCFI with aarch64-split-sve-objects is not supported"); + + // Split ZPR and PPR allocation. + // Allocate PPR callee saves + allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || + ZPRLocalsSize || PPRLocalsSize); + CFAOffset += PPRCalleeSavesSize; + + // Allocate PPR locals + ZPR callee saves + assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + "Expected ZPR callee saves after PPR locals"); + allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding, + PPRLocalsSize + ZPRCalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRLocalsSize); + CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; + + // Allocate ZPR locals + allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding, + ZPRLocalsSize + StackOffset::getFixed(NumBytes), EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + } else { + // Allocate space for the callee saves (if any). + StackOffset LocalsSize = + PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); + if (!FPAfterSVECalleeSaves) + allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + CFAOffset += SVECalleeSavesSize; + + // Allocate space for the rest of the frame including SVE locals. Align the + // stack as necessary. + assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; + allocateStackSpace(AfterSVESavesI, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -796,7 +860,8 @@ void AArch64PrologueEmitter::emitPrologue() { emitDefineCFAWithFP(AfterSVESavesI, FixedObject); } else { StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + AFL.getSVEStackSize(MF) + + StackOffset::getFixed((int64_t)MFI.getStackSize()); CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup); CFIBuilder.insertCFIInst( createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, @@ -1165,7 +1230,7 @@ void AArch64PrologueEmitter::emitCalleeSavedGPRLocations( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); for (const auto &Info : CSI) { unsigned FrameIdx = Info.getFrameIdx(); - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) + if (MFI.hasScalableStackID(FrameIdx)) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); @@ -1191,8 +1256,10 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( AFL.getOffsetOfLocalArea(); } + StackOffset PPRStackSize = AFL.getPPRStackSize(MF); for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector) + int FI = Info.getFrameIdx(); + if (!MFI.hasScalableStackID(FI)) continue; // Not all unwinders may know about SVE registers, so assume the lowest @@ -1203,9 +1270,13 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( continue; StackOffset Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); + if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + Offset -= PPRStackSize; + CFIBuilder.insertCFIInst( createCFAOffset(RegInfo, Reg, Offset, IncomingVGOffsetFromDefCFA)); } @@ -1322,7 +1393,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { + (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1346,7 +1417,9 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF); + StackOffset ZPRStackSize = AFL.getZPRStackSize(MF); + StackOffset PPRStackSize = AFL.getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { @@ -1367,106 +1440,188 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - // Process the SVE callee-saves to determine what space needs to be - // deallocated. - StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isSVECalleeSave(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isSVECalleeSave(RestoreBegin) && - isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; - } - - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + if (!AFI->hasSplitSVEObjects()) { + // Process the SVE callee-saves to determine what space needs to be + // deallocated. + StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; + MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, + RestoreEnd = FirstGPRRestoreI; + int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); + int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); + int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + + if (SVECalleeSavedSize) { + if (FPAfterSVECalleeSaves) + RestoreEnd = MBB.getFirstTerminator(); + + RestoreBegin = std::prev(RestoreEnd); + while (RestoreBegin != MBB.begin() && + isPartOfSVECalleeSaves(std::prev(RestoreBegin))) + --RestoreBegin; + + assert(isPartOfSVECalleeSaves(RestoreBegin) && + isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && + "Unexpected instruction"); + + StackOffset CalleeSavedSizeAsOffset = + StackOffset::getScalable(SVECalleeSavedSize); + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; } - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, - MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset( - MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); - NumBytes = 0; + // Deallocate the SVE area. + if (FPAfterSVECalleeSaves) { + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate + // them explicitly. + if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } + // Deallocate callee-save non-SVE registers. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); + + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); + // Deallocate callee-save SVE registers. emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + NeedsWinCFI, &HasWinCFI); + } else if (SVEStackSize) { + int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); + // If we have stack realignment or variable-sized objects we must use the + // FP to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). + Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need + // to compute the base address by subtracting the offest in a + // temporary register first (to avoid briefly deallocating the SVE + // CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, + MachineInstr::FrameDestroy); + } + // The code below will deallocate the stack space space by moving the + // SP to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + StackOffset::getScalable(-SVECalleeSavedSize), TII, + MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + if (SVECalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + StackOffset::getFixed( + NumBytes + PrologueSaveSize)); + NumBytes = 0; + } + + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + + if (EmitCFI) + emitCalleeSavedSVERestores(RestoreEnd); + } + } else if (AFI->hasSplitSVEObjects() && SVEStackSize) { + // TODO: Support stack realigment and variable-sized objects. + assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && + "unexpected stack realignment or variable sized objects with split " + "SVE stack objects"); + // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR + // areas. + auto ZPRCalleeSavedSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + auto PPRCalleeSavedSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; + StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; + + MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI, + PPRRestoreEnd = FirstGPRRestoreI; + if (PPRCalleeSavedSize) { + PPRRestoreBegin = std::prev(PPRRestoreEnd); + while (PPRRestoreBegin != MBB.begin() && + isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) + --PPRRestoreBegin; + } + + MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, + ZPRRestoreEnd = PPRRestoreBegin; + if (ZPRCalleeSavedSize) { + ZPRRestoreBegin = std::prev(ZPRRestoreEnd); + while (ZPRRestoreBegin != MBB.begin() && + isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) + --ZPRRestoreBegin; } + + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !HasFP, CFAOffset); + NumBytes = 0; + CFAOffset -= NonSVELocals; + } + + if (ZPRLocalsSize) { + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= ZPRLocalsSize; + } + + if (PPRLocalsSize || ZPRCalleeSavedSize) { + assert(PPRRestoreBegin == ZPRRestoreEnd && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + PPRLocalsSize + ZPRCalleeSavedSize, TII, + MachineInstr::FrameDestroy, false, false, nullptr, + EmitCFI && !HasFP, CFAOffset); + CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; + } + if (PPRCalleeSavedSize) { + emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, + PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, + false, false, nullptr, EmitCFI && !HasFP, CFAOffset); + } + + // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. if (EmitCFI) - emitCalleeSavedSVERestores(RestoreEnd); + emitCalleeSavedSVERestores(ZPRRestoreEnd); } if (!HasFP) { @@ -1624,8 +1779,7 @@ void AArch64EpilogueEmitter::emitCalleeSavedRestores( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); for (const auto &Info : CSI) { - if (SVE != - (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + if (SVE != MFI.hasScalableStackID(Info.getFrameIdx())) continue; MCRegister Reg = Info.getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 2b0c8ad..79975b0 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -71,6 +71,7 @@ bool AArch64RegisterInfo::regNeedsCFI(MCRegister Reg, const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); + auto &AFI = *MF->getInfo<AArch64FunctionInfo>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) // GHC set of callee saved regs is empty as all those regs are @@ -101,10 +102,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_Win_AArch64_AAPCS_SwiftTail_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) return CSR_Win_AArch64_AAVPCS_SaveList; - if (MF->getFunction().getCallingConv() == - CallingConv::AArch64_SVE_VectorCall) - return CSR_Win_AArch64_SVE_AAPCS_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_Win_AArch64_SVE_AAPCS_SaveList; return CSR_Win_AArch64_AAPCS_SaveList; } @@ -148,7 +146,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. return CSR_AArch64_AAPCS_X18_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } @@ -158,6 +156,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() && "Invalid subtarget for getDarwinCalleeSavedRegs"); + auto &AFI = *MF->getInfo<AArch64FunctionInfo>(); if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) report_fatal_error( @@ -205,7 +204,7 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { return CSR_Darwin_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) return CSR_Darwin_AArch64_AAPCS_Win64_SaveList; - if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) + if (AFI.hasSVE_AAPCS(*MF)) return CSR_Darwin_AArch64_SVE_AAPCS_SaveList; return CSR_Darwin_AArch64_AAPCS_SaveList; } @@ -643,7 +642,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (ST.hasSVE() || ST.isStreaming()) { // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. - if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize()) return true; } @@ -783,7 +782,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() && !AFI->hasStackHazardSlotIndex(); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7ee54c5..c197550e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -438,7 +438,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, - G_FSINH, G_FTANH}) + G_FSINH, G_FTANH, G_FMODF}) // We need a call for these, so we always need to scalarize. .scalarize(0) // Regardless of FP16 support, widen 16-bit elements to 32-bits. |