Diffstat (limited to 'llvm/lib/Target/AArch64')
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp                 |  51
 llvm/lib/Target/AArch64/AArch64ISelLowering.h                   |   1
 llvm/lib/Target/AArch64/AArch64InstrGISel.td                    |   7
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp                    |   3
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp          |   8
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h            |   2
 llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp            |   4
 llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp             |  70
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td                  |   2
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp                    |   2
 llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp          |   2
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp          |   2
 llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp |   2
 llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp       |   1
 llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h         |   1
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp                   | 108
 llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp          |   6
 17 files changed, 209 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6965116..662d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
return LowerVECREDUCE(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_FMUL:
+ return LowerVECREDUCE_MUL(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
@@ -16254,7 +16259,7 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
SplatVal > 1) {
SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
+ DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
if (Negated)
Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
@@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+ SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+ SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+ // Whilst we don't know the size of the vector we do know the maximum size,
+ // so we can perform a tree reduction with an identity vector, which means
+ // once we arrive at the result the remaining stages (when the vector is
+ // smaller than the maximum) have no effect.
+
+ unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+ unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+ for (unsigned I = 0; I < Stages; ++I) {
+ Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+ Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+ }
+
+ return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
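To make the comment in LowerVECREDUCE_MUL above concrete, here is a rough fixed-length analogue of the lowering (a minimal sketch; MaxElts, the helper name, and the use of plain arrays are illustrative assumptions, not part of the patch):

#include <array>
#include <cstddef>
#include <cstdio>

constexpr std::size_t MaxElts = 8; // stand-in for Segments * MinNumElements

// One deinterleave-and-multiply stage: treat {Src, identity padding} as a
// single interleaved sequence, split it into even/odd lanes, then multiply
// the two halves lane-wise.
std::array<int, MaxElts> reduceStage(const std::array<int, MaxElts> &Src) {
  std::array<int, 2 * MaxElts> Concat{};
  for (std::size_t I = 0; I < MaxElts; ++I) {
    Concat[I] = Src[I];      // live data (already padded with 1s past the end)
    Concat[MaxElts + I] = 1; // identity vector
  }
  std::array<int, MaxElts> Out{};
  for (std::size_t I = 0; I < MaxElts; ++I)
    Out[I] = Concat[2 * I] * Concat[2 * I + 1]; // even lanes * odd lanes
  return Out;
}

int main() {
  // Pretend the runtime vector only holds 4 real elements; pad with 1s.
  std::array<int, MaxElts> V = {2, 3, 4, 5, 1, 1, 1, 1};
  for (std::size_t Stage = 0; Stage < 3; ++Stage) // log2(MaxElts) stages
    V = reduceStage(V);
  std::printf("%d\n", V[0]); // prints 120 == 2*3*4*5
}

Each stage halves the number of live products while the identity padding only ever contributes factors of 1, so running the full Log2(maximum element count) stages gives the right answer whatever the actual vector length turns out to be.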
@@ -18144,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
}
auto *LI = dyn_cast<LoadInst>(Load);
@@ -18223,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
return false;
}
StoreInst *SI = dyn_cast<StoreInst>(Store);
@@ -22942,7 +22974,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_sve_asrd:
- return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
@@ -26196,9 +26228,10 @@ static SDValue performFlagSettingCombine(SDNode *N,
return DCI.CombineTo(N, Res, SDValue(N, 1));
}
- // Combine identical generic nodes into this node, re-using the result.
+ // Combine equivalent generic nodes into this node, re-using the result.
if (SDNode *Generic = DCI.DAG.getNodeIfExists(
- GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
+ GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
+ /*AllowCommute=*/true))
DCI.CombineTo(Generic, SDValue(N, 0));
return SDValue();
@@ -30046,7 +30079,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
+ DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
if (Negated)
Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
DAG.getConstant(0, DL, ContainerVT), Res);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fd..9495c9f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -752,6 +752,7 @@ private:
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 7322212..fe84193 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+def G_USDOT : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+ let hasSideEffects = 0;
+}
+
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
// expands into BSL/BIT/BIF after register allocation.
def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
def : GINodeEquiv<G_UDOT, AArch64udot>;
def : GINodeEquiv<G_SDOT, AArch64sdot>;
+def : GINodeEquiv<G_USDOT, AArch64usdot>;
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
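Note: the new G_USDOT generic instruction mirrors the existing G_UDOT/G_SDOT definitions (same operand profile, no side effects), and the GINodeEquiv entry ties it to the AArch64usdot SelectionDAG node so the existing usdot selection patterns can be imported for the GlobalISel opcode as well.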
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 30dfcf2b..12c600f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
+ if (!DestReg.isValid() || !SrcReg.isValid())
+ return std::nullopt;
+
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index b3c9656..343fd81 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -40,7 +40,11 @@ yaml::AArch64FunctionInfo::AArch64FunctionInfo(
getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)),
HasStackFrame(MFI.hasStackFrame()
? std::optional<bool>(MFI.hasStackFrame())
- : std::nullopt) {}
+ : std::nullopt),
+ HasStreamingModeChanges(
+ MFI.hasStreamingModeChanges()
+ ? std::optional<bool>(MFI.hasStreamingModeChanges())
+ : std::nullopt) {}
void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
MappingTraits<AArch64FunctionInfo>::mapping(YamlIO, *this);
@@ -55,6 +59,8 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
YamlMFI.StackSizePPR.value_or(0));
if (YamlMFI.HasStackFrame)
setHasStackFrame(*YamlMFI.HasStackFrame);
+ if (YamlMFI.HasStreamingModeChanges)
+ setHasStreamingModeChanges(*YamlMFI.HasStreamingModeChanges);
}
static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index bd0a17d..d1832f4 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -645,6 +645,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
std::optional<uint64_t> StackSizeZPR;
std::optional<uint64_t> StackSizePPR;
std::optional<bool> HasStackFrame;
+ std::optional<bool> HasStreamingModeChanges;
AArch64FunctionInfo() = default;
AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
@@ -659,6 +660,7 @@ template <> struct MappingTraits<AArch64FunctionInfo> {
YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR);
YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR);
YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame);
+ YamlIO.mapOptional("hasStreamingModeChanges", MFI.HasStreamingModeChanges);
}
};
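With the optional mapping above, MIR files can now carry the flag explicitly; for example (illustrative placement), a test's machineFunctionInfo block could contain `hasStreamingModeChanges: true`, and initializeBaseYamlFields transfers it back onto AArch64FunctionInfo when the MIR is parsed.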
diff --git a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
index cdf2822..a90950d 100644
--- a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
@@ -75,6 +75,10 @@ bool AArch64PostCoalescer::runOnMachineFunction(MachineFunction &MF) {
if (Src != Dst)
MRI->replaceRegWith(Dst, Src);
+ if (MI.getOperand(1).isUndef())
+ for (MachineOperand &MO : MRI->use_operands(Dst))
+ MO.setIsUndef();
+
// MI must be erased from the basic block before recalculating the live
// interval.
LIS->RemoveMachineInstrFromMaps(MI);
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index f110558..7e03b97 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
}
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
- // Assume we can't combine the last pop with the sp restore.
- bool CombineAfterCSRBump = false;
+
+ unsigned ProloguePopSize = PrologueSaveSize;
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ // With CalleeSavesAboveFrameRecord, ProloguePopSize is the amount of stack
+ // that needs to be popped until we reach the start of the SVE save area.
+ // The "FixedObject" stack occurs after the SVE area and must be popped
+ // later.
+ ProloguePopSize -= FixedObject;
AfterCSRPopSize += FixedObject;
- } else if (!CombineSPBump && PrologueSaveSize != 0) {
+ }
+
+ // Assume we can't combine the last pop with the sp restore.
+ if (!CombineSPBump && ProloguePopSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
- AArch64InstrInfo::isSEHInstruction(*Pop))
+ AArch64InstrInfo::isSEHInstruction(*Pop) ||
+ (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(Pop)))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
@@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() {
// may clobber), convert it to a post-index ldp.
if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
convertCalleeSaveRestoreToSPPrePostIncDec(
- Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy,
- PrologueSaveSize);
+ Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy,
+ ProloguePopSize);
+ } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ MachineBasicBlock::iterator AfterLastPop = std::next(Pop);
+ if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop))
+ ++AfterLastPop;
+ // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate
+ // callee-save non-SVE registers to move the stack pointer to the start of
+ // the SVE area.
+ emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(ProloguePopSize), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
} else {
- // If not, make sure to emit an add after the last ldp.
+ // Otherwise, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
- AfterCSRPopSize += PrologueSaveSize;
- CombineAfterCSRBump = true;
+ AfterCSRPopSize += ProloguePopSize;
}
}
-
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
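As a worked example of the bookkeeping above (hypothetical sizes, not taken from the patch): if the prologue saved 96 bytes of which FixedObject is 16, then with CalleeSavesAboveFrameRecord the epilogue pops ProloguePopSize = 96 - 16 = 80 bytes to reach the start of the SVE save area, while the remaining 16 bytes are folded into AfterCSRPopSize and restored with the final stack-pointer adjustment; without that layout, ProloguePopSize stays equal to PrologueSaveSize as before.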
@@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() {
--SEHEpilogueStartI;
}
+ // Determine the ranges of SVE callee-saves. This is done before emitting any
+ // code at the end of the epilogue (for Swift async), which can get in the way
+ // of finding SVE callee-saves with CalleeSavesAboveFrameRecord.
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
@@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- auto [PPR, ZPR] = getSVEStackFrameSizes();
- auto [PPRRange, ZPRRange] = partitionSVECS(
- MBB,
- SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
- ? MBB.getFirstTerminator()
- : FirstGPRRestoreI,
- PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
-
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
StackOffset SVEStackSize =
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
@@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NeedsWinCFI, &HasWinCFI);
}
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
// Deallocate callee-save SVE registers.
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
@@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
false, NeedsWinCFI, &HasWinCFI, EmitCFI,
- StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
+ StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore));
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bc6b931..98a128e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -265,7 +265,7 @@ def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>
]>;
-def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>;
+def AArch64asrd_m1 : SDNode<"AArch64ISD::ASRD_MERGE_OP1", SDT_AArch64Arith_Imm>;
def AArch64urshri_p_node : SDNode<"AArch64ISD::URSHR_I_PRED", SDT_AArch64Arith_Imm>;
def AArch64urshri_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 12ddf47..53b00e8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -273,7 +273,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case NeoverseN2:
case NeoverseN3:
PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 2c3870c..636d4f8a 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -8217,6 +8217,8 @@ bool AArch64AsmParser::parseDataExpr(const MCExpr *&Res) {
Spec = AArch64::S_GOTPCREL;
else if (Identifier == "plt")
Spec = AArch64::S_PLT;
+ else if (Identifier == "funcinit")
+ Spec = AArch64::S_FUNCINIT;
}
if (Spec == AArch64::S_None)
return Error(Loc, "invalid relocation specifier");
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 9e2d698..05a4313 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerTriOp(AArch64::G_UDOT);
case Intrinsic::aarch64_neon_sdot:
return LowerTriOp(AArch64::G_SDOT);
+ case Intrinsic::aarch64_neon_usdot:
+ return LowerTriOp(AArch64::G_USDOT);
case Intrinsic::aarch64_neon_sqxtn:
return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
case Intrinsic::aarch64_neon_sqxtun:
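For context, the kind of source that reaches this new case is the NEON usdot intrinsic; a hypothetical ACLE-style use (the vusdotq_s32 spelling and the +i8mm requirement are assumptions here, not something this diff adds) looks like:

#include <arm_neon.h>

// Mixed-sign dot product: each s32 lane accumulates four u8*s8 products.
int32x4_t usdot_example(int32x4_t Acc, uint8x16_t U, int8x16_t S) {
  return vusdotq_s32(Acc, U, S); // @llvm.aarch64.neon.usdot -> G_USDOT
}

With the legalizer change, GlobalISel lowers the intrinsic to G_USDOT instead of falling back, matching how udot/sdot were already handled.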
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index a388216..892b8da 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -232,6 +232,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
if (RefKind == AArch64::S_AUTH || RefKind == AArch64::S_AUTHADDR)
return ELF::R_AARCH64_AUTH_ABS64;
+ if (RefKind == AArch64::S_FUNCINIT)
+ return ELF::R_AARCH64_FUNCINIT64;
return ELF::R_AARCH64_ABS64;
}
case AArch64::fixup_aarch64_add_imm12:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 2b5cf34..bc090c6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -40,6 +40,7 @@ const MCAsmInfo::AtSpecifier ELFAtSpecifiers[] = {
{AArch64::S_GOT, "GOT"},
{AArch64::S_GOTPCREL, "GOTPCREL"},
{AArch64::S_PLT, "PLT"},
+ {AArch64::S_FUNCINIT, "FUNCINIT"},
};
const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 0dfa61b..f2acff5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -164,6 +164,7 @@ enum {
// ELF relocation specifiers in data directives:
S_PLT = 0x400,
S_GOTPCREL,
+ S_FUNCINIT,
// Mach-O @ relocation specifiers:
S_MACHO_GOT,
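Taken together, the funcinit hunks in the asm parser, ELF object writer, and this file add a new data-directive relocation specifier: an expression tagged with it (hypothetically something like `.xword callback@FUNCINIT`, assuming the same `sym@SPECIFIER` syntax as the other entries in this table) is emitted as an R_AARCH64_FUNCINIT64 relocation rather than a plain R_AARCH64_ABS64.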
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 4749748..434ea67 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -294,6 +294,12 @@ struct MachineSMEABI : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
+ /// Attempts to find an insertion point before \p Inst where the status flags
+ /// are not live. If \p Inst is `Block.Insts.end()`, a point before the end of
+ /// the block is found.
+ std::pair<MachineBasicBlock::iterator, LiveRegs>
+ findStateChangeInsertionPoint(MachineBasicBlock &MBB, const BlockInfo &Block,
+ SmallVectorImpl<InstInfo>::const_iterator Inst);
void emitStateChange(EmitContext &, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, ZAState From,
ZAState To, LiveRegs PhysLiveRegs);
@@ -337,6 +343,28 @@ private:
MachineRegisterInfo *MRI = nullptr;
};
+static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) {
+ LiveRegs PhysLiveRegs = LiveRegs::None;
+ if (!LiveUnits.available(AArch64::NZCV))
+ PhysLiveRegs |= LiveRegs::NZCV;
+ // We have to track W0 and X0 separately as otherwise things can get
+ // confused if we attempt to preserve X0 but only W0 was defined.
+ if (!LiveUnits.available(AArch64::W0))
+ PhysLiveRegs |= LiveRegs::W0;
+ if (!LiveUnits.available(AArch64::W0_HI))
+ PhysLiveRegs |= LiveRegs::W0_HI;
+ return PhysLiveRegs;
+}
+
+static void setPhysLiveRegs(LiveRegUnits &LiveUnits, LiveRegs PhysLiveRegs) {
+ if (PhysLiveRegs & LiveRegs::NZCV)
+ LiveUnits.addReg(AArch64::NZCV);
+ if (PhysLiveRegs & LiveRegs::W0)
+ LiveUnits.addReg(AArch64::W0);
+ if (PhysLiveRegs & LiveRegs::W0_HI)
+ LiveUnits.addReg(AArch64::W0_HI);
+}
+
FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
SMEFnAttrs.hasZAState()) &&
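Hoisting these helpers out of the old lambda (removed in the next hunk) is what lets findStateChangeInsertionPoint reuse them: it seeds a fresh LiveRegUnits from an already-computed LiveRegs mask via setPhysLiveRegs, steps backwards through the block, and converts the result back with getPhysLiveRegs at the chosen insertion point.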
@@ -362,26 +390,13 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
LiveRegUnits LiveUnits(*TRI);
LiveUnits.addLiveOuts(MBB);
- auto GetPhysLiveRegs = [&] {
- LiveRegs PhysLiveRegs = LiveRegs::None;
- if (!LiveUnits.available(AArch64::NZCV))
- PhysLiveRegs |= LiveRegs::NZCV;
- // We have to track W0 and X0 separately as otherwise things can get
- // confused if we attempt to preserve X0 but only W0 was defined.
- if (!LiveUnits.available(AArch64::W0))
- PhysLiveRegs |= LiveRegs::W0;
- if (!LiveUnits.available(AArch64::W0_HI))
- PhysLiveRegs |= LiveRegs::W0_HI;
- return PhysLiveRegs;
- };
-
- Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
+ Block.PhysLiveRegsAtExit = getPhysLiveRegs(LiveUnits);
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
- LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+ LiveRegs PhysLiveRegs = getPhysLiveRegs(LiveUnits);
// The SMEStateAllocPseudo marker is added to a function if the save
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
@@ -476,6 +491,49 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
return BundleStates;
}
+std::pair<MachineBasicBlock::iterator, LiveRegs>
+MachineSMEABI::findStateChangeInsertionPoint(
+ MachineBasicBlock &MBB, const BlockInfo &Block,
+ SmallVectorImpl<InstInfo>::const_iterator Inst) {
+ LiveRegs PhysLiveRegs;
+ MachineBasicBlock::iterator InsertPt;
+ if (Inst != Block.Insts.end()) {
+ InsertPt = Inst->InsertPt;
+ PhysLiveRegs = Inst->PhysLiveRegs;
+ } else {
+ InsertPt = MBB.getFirstTerminator();
+ PhysLiveRegs = Block.PhysLiveRegsAtExit;
+ }
+
+ if (!(PhysLiveRegs & LiveRegs::NZCV))
+ return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+
+ // Find the previous state change. We cannot move before this point.
+ MachineBasicBlock::iterator PrevStateChangeI;
+ if (Inst == Block.Insts.begin()) {
+ PrevStateChangeI = MBB.begin();
+ } else {
+ // Note: `std::prev(Inst)` is the previous InstInfo. We only create an
+ // InstInfo object for instructions that require a specific ZA state, so the
+ // InstInfo is the site of the previous state change in the block (which can
+ // be several MIs earlier).
+ PrevStateChangeI = std::prev(Inst)->InsertPt;
+ }
+
+ // Note: LiveUnits will only accurately track X0 and NZCV.
+ LiveRegUnits LiveUnits(*TRI);
+ setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+ for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
+ // Don't move before/into a call (which may have a state change before it).
+ if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
+ break;
+ LiveUnits.stepBackward(*I);
+ if (LiveUnits.available(AArch64::NZCV))
+ return {I, getPhysLiveRegs(LiveUnits)};
+ }
+ return {InsertPt, PhysLiveRegs};
+}
+
void MachineSMEABI::insertStateChanges(EmitContext &Context,
const FunctionInfo &FnInfo,
const EdgeBundles &Bundles,
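For instance (a hypothetical sequence): if NZCV is set by a CMP several instructions before the instruction that needs the new ZA state and is only consumed by a later conditional branch, the backward walk above stops once stepping over the CMP makes NZCV available again, so the state change is inserted just before the CMP instead of clobbering the live flags between the CMP and its branch.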
@@ -490,10 +548,13 @@ void MachineSMEABI::insertStateChanges(EmitContext &Context,
CurrentState = InState;
for (auto &Inst : Block.Insts) {
- if (CurrentState != Inst.NeededState)
- emitStateChange(Context, MBB, Inst.InsertPt, CurrentState,
- Inst.NeededState, Inst.PhysLiveRegs);
- CurrentState = Inst.NeededState;
+ if (CurrentState != Inst.NeededState) {
+ auto [InsertPt, PhysLiveRegs] =
+ findStateChangeInsertionPoint(MBB, Block, &Inst);
+ emitStateChange(Context, MBB, InsertPt, CurrentState, Inst.NeededState,
+ PhysLiveRegs);
+ CurrentState = Inst.NeededState;
+ }
}
if (MBB.succ_empty())
@@ -501,9 +562,12 @@ void MachineSMEABI::insertStateChanges(EmitContext &Context,
ZAState OutState =
BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)];
- if (CurrentState != OutState)
- emitStateChange(Context, MBB, MBB.getFirstTerminator(), CurrentState,
- OutState, Block.PhysLiveRegsAtExit);
+ if (CurrentState != OutState) {
+ auto [InsertPt, PhysLiveRegs] =
+ findStateChangeInsertionPoint(MBB, Block, Block.Insts.end());
+ emitStateChange(Context, MBB, InsertPt, CurrentState, OutState,
+ PhysLiveRegs);
+ }
}
}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index dd6fa16..d71f728 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -130,6 +130,12 @@ SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI)
if (auto *CalledFunction = CB.getCalledFunction())
CalledFn = SMEAttrs(*CalledFunction, TLI);
+ // An `invoke` of an agnostic ZA function may not return normally (it may
+ // resume in an exception block). In this case, it acts like a private ZA
+ // callee and may require a ZA save to be set up before it is called.
+ if (isa<InvokeInst>(CB))
+ CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
+
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
assert((IsIndirect ||