Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp          |  10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp  |   2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp                 |  33
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp                    | 100
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.h                      |   5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td                     |   5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp                |  12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp                 |  88
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td                  |  12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h                      |   2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp                |   1
11 files changed, 220 insertions, 50 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index cb831963..7712d2a 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -629,8 +629,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
}
const MCInstrDesc &MCID = TII->get(Opc);
// Create a dummy virtual register for the SUBS def.
- Register DestReg =
- MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI));
+ Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0));
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
.addReg(DestReg, RegState::Define | RegState::Dead)
@@ -638,8 +637,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
.addImm(0)
.addImm(0);
// SUBS uses the GPR*sp register classes.
- MRI->constrainRegClass(HeadCond[2].getReg(),
- TII->getRegClass(MCID, 1, TRI));
+ MRI->constrainRegClass(HeadCond[2].getReg(), TII->getRegClass(MCID, 1));
}
Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
@@ -686,10 +684,10 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
const MCInstrDesc &MCID = TII->get(Opc);
MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
- TII->getRegClass(MCID, 0, TRI));
+ TII->getRegClass(MCID, 0));
if (CmpMI->getOperand(FirstOp + 1).isReg())
MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
- TII->getRegClass(MCID, 1, TRI));
+ TII->getRegClass(MCID, 1));
MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
.add(CmpMI->getOperand(FirstOp)); // Register Rn
if (isZBranch)
diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 75361f5..4ff49a6 100644
--- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -156,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
continue;
}
- const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, I);
unsigned NewReg;
if (RC == nullptr) {
LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 76a790dc..8457f61 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22308,6 +22308,37 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
}
+// Attempt to combine the following patterns:
+// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b)
+// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b)
+// The CSET may be preceded by a ZEXT.
+static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() != ISD::SUB)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse())
+ N1 = N1.getOperand(0);
+ if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO)
+ return SDValue();
+
+ SDValue Flags = N1.getOperand(3);
+ if (Flags.getOpcode() != AArch64ISD::SUBS)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ if (N0->getOpcode() == ISD::SUB)
+ return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0),
+ N0.getOperand(1), Flags);
+ return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT),
+ Flags);
+}
+
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
// Try to change sum of two reductions.
@@ -22329,6 +22360,8 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
return Val;
+ if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
+ return Val;
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
return Val;
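
For context, a minimal source-level sketch of what the new performSubWithBorrowCombine targets (the function and variable names below are hypothetical; only the SUB/CSET/SBC shapes come from the patch). The borrow of the low-word compare is materialized as CSET LO, and the outer subtraction chain folds into a single SBC:

    // Hedged illustration of the borrow pattern in a multi-word subtract.
    #include <cstdint>
    uint64_t sub_hi_with_borrow(uint64_t AHi, uint64_t BHi,
                                uint64_t ALo, uint64_t BLo) {
      uint64_t Borrow = ALo < BLo;  // CMP ALo, BLo; CSET LO (possibly ZEXTed)
      return (AHi - BHi) - Borrow;  // SUB (SUB AHi, BHi), CSET -> SBC AHi, BHi
    }
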
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4b40733..b93e562 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -91,7 +91,7 @@ static cl::opt<unsigned> GatherOptSearchLimit(
"machine-combiner gather pattern optimization"));
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
- : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN,
+ : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
@@ -1780,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) {
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::ANDSXrs:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
return Instr.getOpcode();
case AArch64::ADDWrr:
@@ -1810,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) {
return AArch64::ANDSWri;
case AArch64::ANDXri:
return AArch64::ANDSXri;
+ case AArch64::ANDWrr:
+ return AArch64::ANDSWrr;
+ case AArch64::ANDWrs:
+ return AArch64::ANDSWrs;
+ case AArch64::ANDXrr:
+ return AArch64::ANDSXrr;
+ case AArch64::ANDXrs:
+ return AArch64::ANDSXrs;
+ case AArch64::BICWrr:
+ return AArch64::BICSWrr;
+ case AArch64::BICXrr:
+ return AArch64::BICSXrr;
+ case AArch64::BICWrs:
+ return AArch64::BICSWrs;
+ case AArch64::BICXrs:
+ return AArch64::BICSXrs;
}
}
@@ -1947,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) {
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}
+static bool isANDOpcode(MachineInstr &MI) {
+ unsigned Opc = sForm(MI);
+ switch (Opc) {
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::ANDSXrs:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
@@ -1984,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
// 1) MI and CmpInstr set N and V to the same value.
// 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
// signed overflow occurs, so CmpInstr could still be simplified away.
- if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
+ // Note that ANDS and BICS instructions always clear the V flag.
+ if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
return false;
AccessKind AccessToCheck = AK_Write;
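
A hedged example of the compare this makes removable (hypothetical function, not from the patch): once sForm() rewrites the AND into flag-setting ANDS, the explicit compare against zero is redundant, and because ANDS/BICS always clear V this holds even for conditions such as GE that read the V flag.

    // Sketch only: the AND becomes ANDS and the CMP against zero is deleted;
    // V is known to be 0 after ANDS, so V-reading conditions stay correct.
    int sign_of_mask(int A, int B) {
      int M = A & B;  // ANDWrr -> ANDSWrr
      return M >= 0;  // was CMP M, #0 + CSET GE; GE reads both N and V
    }
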
@@ -5618,7 +5664,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI,
Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction &MF = *MBB.getParent();
@@ -5632,7 +5677,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
bool Offset = true;
MCRegister PNRReg = MCRegister::NoRegister;
unsigned StackID = TargetStackID::Default;
- switch (TRI->getSpillSize(*RC)) {
+ switch (RI.getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
@@ -5795,10 +5840,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
.addMemOperand(MMO);
}
-void AArch64InstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
- int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
- Register VReg, MachineInstr::MIFlag Flags) const {
+void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register DestReg, int FI,
+ const TargetRegisterClass *RC,
+ Register VReg,
+ MachineInstr::MIFlag Flags) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -5810,7 +5857,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
bool Offset = true;
unsigned StackID = TargetStackID::Default;
Register PNRReg = MCRegister::NoRegister;
- switch (TRI->getSpillSize(*RC)) {
+ switch (TRI.getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
@@ -6446,10 +6493,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
- getRegClass(SrcReg), &TRI, Register());
+ getRegClass(SrcReg), Register());
else
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
- getRegClass(DstReg), &TRI, Register());
+ getRegClass(DstReg), Register());
return &*--InsertPt;
}
@@ -6467,8 +6514,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
- FrameIndex, &AArch64::GPR64RegClass, &TRI,
- Register());
+ FrameIndex, &AArch64::GPR64RegClass, Register());
return &*--InsertPt;
}
@@ -6502,7 +6548,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
- loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
+ loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
Register());
MachineInstr &LoadMI = *--InsertPt;
MachineOperand &LoadDst = LoadMI.getOperand(0);
@@ -9590,6 +9636,27 @@ AArch64InstrInfo::getOutliningCandidateInfo(
unsigned NumBytesToCreateFrame = 0;
+ // Avoid splitting an ADRP+ADD/LDR pair into outlined functions.
+ // These instructions are fused together by the scheduler.
+ // Any candidate where ADRP is the last instruction should be rejected,
+ // as that would lead to splitting the ADRP pair.
+ MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
+ MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
+ if (LastMI.getOpcode() == AArch64::ADRP &&
+ (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
+ (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
+ return std::nullopt;
+ }
+
+ // Similarly, any candidate where the first instruction is an ADD/LDR with
+ // a page offset should be rejected to avoid splitting the ADRP pair.
+ if ((FirstMI.getOpcode() == AArch64::ADDXri ||
+ FirstMI.getOpcode() == AArch64::LDRXui) &&
+ (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
+ (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
+ return std::nullopt;
+ }
+
// We only allow outlining for functions having exactly matching return
// address signing attributes, i.e., all share the same value for the
// attribute "sign-return-address" and all share the same type of key they
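
The fused pair those checks keep together looks like this (sketch with a hypothetical GOT-resolved symbol; the flags tested above are MO_PAGE/MO_PAGEOFF combined with MO_GOT):

    adrp x16, :got:sym              // must not be the last outlined instruction
    ldr  x16, [x16, :got_lo12:sym]  // must not be the first outlined instruction

Splitting this pair across an outlined-function boundary would defeat the fusion described in the comment above.
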
@@ -10996,8 +11063,6 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
MachineBasicBlock::iterator InsertTo) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
- const TargetRegisterInfo *TRI =
- MBB.getParent()->getSubtarget().getRegisterInfo();
MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
Register Result = 0;
for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
@@ -11006,8 +11071,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
MRI.getRegClass(NewMI->getOperand(0).getReg()));
NewMI->getOperand(I).setReg(Result);
} else if (I == ReplaceOprNum) {
- MRI.constrainRegClass(ReplaceReg,
- TII->getRegClass(NewMI->getDesc(), I, TRI));
+ MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
NewMI->getOperand(I).setReg(ReplaceReg);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 179574a..979c9ac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -353,14 +353,13 @@ public:
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
void loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
// This tells target independent code that it is okay to pass instructions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 76f076a..b30e3d0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4444,6 +4444,11 @@ defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple
+// of 8).
+def : InstAlias<"prfm $Rt, [$Rn, $offset]",
+ (PRFUMi prfop:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+
//---
// (unscaled immediate, unprivileged)
defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
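
A hedged usage sketch for the new alias (prefetch operation and registers chosen arbitrarily): offsets that PRFMui cannot encode now assemble via PRFUMi instead of being rejected.

    prfm pldl1keep, [x0, #-8]  // negative offset: emitted as prfum
    prfm pldl1keep, [x0, #9]   // not a multiple of 8: likewise falls back
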
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 04e76c7..d25db89 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -595,17 +595,17 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm(
// Determine register classes for destinations and register operands
const TargetRegisterClass *FirstInstrDstRC =
- TII->getRegClass(TII->get(Opcode.first), 0, TRI);
+ TII->getRegClass(TII->get(Opcode.first), 0);
const TargetRegisterClass *FirstInstrOperandRC =
- TII->getRegClass(TII->get(Opcode.first), 1, TRI);
+ TII->getRegClass(TII->get(Opcode.first), 1);
const TargetRegisterClass *SecondInstrDstRC =
(Opcode.first == Opcode.second)
? FirstInstrDstRC
- : TII->getRegClass(TII->get(Opcode.second), 0, TRI);
+ : TII->getRegClass(TII->get(Opcode.second), 0);
const TargetRegisterClass *SecondInstrOperandRC =
(Opcode.first == Opcode.second)
? FirstInstrOperandRC
- : TII->getRegClass(TII->get(Opcode.second), 1, TRI);
+ : TII->getRegClass(TII->get(Opcode.second), 1);
// Get old registers destinations and new register destinations
Register DstReg = MI.getOperand(0).getReg();
@@ -784,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
}
const TargetRegisterClass *DstRC64 =
- TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI);
+ TII->getRegClass(TII->get(MI.getOpcode()), 0);
const TargetRegisterClass *DstRC32 =
TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
"sub_32 subregister class");
const TargetRegisterClass *SrcRC64 =
- TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI);
+ TII->getRegClass(TII->get(MI.getOpcode()), 1);
const TargetRegisterClass *SrcRC32 =
TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index a5048b9..f3cf222 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -897,7 +897,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
- MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this));
+ MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0));
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -1123,24 +1123,85 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
-// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
-// where a consecutive multi-vector tuple is constructed from the same indices
-// of multiple strided loads. This may still result in unnecessary copies
-// between the loads and the tuple. Here we try to return a hint to assign the
-// contiguous ZPRMulReg starting at the same register as the first operand of
-// the pseudo, which should be a subregister of the first strided load.
+// We add regalloc hints for different cases:
+// * Choosing a better destination operand for predicated SVE instructions
+//   where the inactive lanes are undef, by picking a register that is
+//   already used by one of the source operands of the instruction.
//
-// For example, if the first strided load has been assigned $z16_z20_z24_z28
-// and the operands of the pseudo are each accessing subregister zsub2, we
-// should look through through Order to find a contiguous register which
-// begins with $z24 (i.e. $z24_z25_z26_z27).
+// * Improve register allocation for SME multi-vector instructions where we can
+// benefit from the strided- and contiguous register multi-vector tuples.
//
+// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register
+// allocation where a consecutive multi-vector tuple is constructed from the
+// same indices of multiple strided loads. This may still result in
+// unnecessary copies between the loads and the tuple. Here we try to return a
+// hint to assign the contiguous ZPRMulReg starting at the same register as
+// the first operand of the pseudo, which should be a subregister of the first
+// strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
bool AArch64RegisterInfo::getRegAllocationHints(
Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
-
auto &ST = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64InstrInfo *TII =
+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // For predicated SVE instructions where the inactive lanes are undef,
+ // pick a destination register that is already used by a source operand,
+ // to avoid introducing a MOVPRFX.
+ const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+ if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
+ for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
+ const MachineInstr &Def = *DefOp.getParent();
+ if (DefOp.isImplicit() ||
+ (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) !=
+ AArch64::FalseLanesUndef)
+ continue;
+
+ unsigned InstFlags =
+ TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags;
+
+ for (MCPhysReg R : Order) {
+ auto AddHintIfSuitable = [&](MCPhysReg R, const MachineOperand &MO) {
+ // R is a suitable hint if the source operand MO has not yet been
+ // assigned a physical register, or if R matches the register that
+ // MO was assigned.
+ if (!VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R)
+ Hints.push_back(R);
+ };
+
+ switch (InstFlags & AArch64::DestructiveInstTypeMask) {
+ default:
+ break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ AddHintIfSuitable(R, Def.getOperand(2));
+ AddHintIfSuitable(R, Def.getOperand(3));
+ AddHintIfSuitable(R, Def.getOperand(4));
+ break;
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ AddHintIfSuitable(R, Def.getOperand(2));
+ AddHintIfSuitable(R, Def.getOperand(3));
+ break;
+ case AArch64::DestructiveBinary:
+ case AArch64::DestructiveBinaryImm:
+ AddHintIfSuitable(R, Def.getOperand(2));
+ break;
+ }
+ }
+ }
+
+ if (Hints.size())
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+ }
+
if (!ST.hasSME() || !ST.isStreaming())
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
VRM);
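
A sketch of the MOVPRFX the new destination hints avoid (registers illustrative). For a destructive, predicated SVE operation whose inactive lanes are undef, hinting the destination onto one of the source registers lets the allocator pick the destructive form directly:

    fadd    z1.s, p0/m, z1.s, z2.s  // dst shares a register with a source

    movprfx z0, z1                  // otherwise a unique dst needs a prefix
    fadd    z0.s, p0/m, z0.s, z2.s  // copy before the destructive op
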
@@ -1153,8 +1214,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
// FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
// instructions over reducing the number of clobbered callee-save registers,
// so we add the strided registers as a hint.
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+ unsigned RegID = RegRC->getID();
if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e1f4386..65b6077 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3597,6 +3597,18 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
(SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
+
+ // Extracts of "unsigned" i8 or i16 elements lead to the zero-extend being
+ // transformed to an AND mask. The mask is redundant since UMOV already zeroes
+ // the high bits of the destination register.
+ def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)),
+ (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
+ def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)),
+ (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
+ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)))), (i64 0xff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)), sub_32)>;
+ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)))), (i64 0xffff))),
+ (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)), sub_32)>;
} // End HasNEON
// Extract first element from vector.
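
What the new patterns save, in a hedged before/after sketch (IR and registers illustrative only):

    ;; %e = extractelement <vscale x 16 x i8> %v, i32 3
    ;; %z = zext i8 %e to i32
    umov w0, v0.b[3]  // before: followed by a redundant 'and w0, w0, #0xff';
                      // after: the AND is dropped, as UMOV already zeroes
                      // the upper bits of w0
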
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8974965..ab4004e 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -157,7 +157,7 @@ public:
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
-
+ bool enableTerminalRule() const override { return true; }
bool enableMachinePipeliner() const override;
bool useDFAforSMS() const override { return false; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 068954f..0bf2b31 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -54,7 +54,6 @@
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <memory>
#include <optional>
-#include <string>
using namespace llvm;