Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp         84
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp               49
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h                 10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp                 265
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.h                     4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td                   27
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp              26
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.cpp                   2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h                     3
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp         8
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp    3
11 files changed, 385 insertions, 96 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 7de66cc..201bfe0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
llvm_unreachable("Unsupported ElementSize");
}
+ // Preserve undef state until DOP's reg is defined.
+ unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0;
+
//
// Create the destructive operation (if required)
//
@@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
.addReg(DstReg, RegState::Define)
.addReg(MI.getOperand(PredIdx).getReg())
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
// After the movprfx, the destructive operand is the same as Dst
DOPIdx = 0;
+ DOPRegState = 0;
// Create the additional LSL to zero the lanes when the DstReg is not
// unique. Zeros the lanes in z0 that aren't active in p0 with sequence
@@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
assert(DOPRegIsUnique && "The destructive operand should be unique");
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
.addReg(DstReg, RegState::Define)
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
DOPIdx = 0;
+ DOPRegState = 0;
}
//
@@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
//
DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+ DOPRegState = DOPRegState | RegState::Kill;
switch (DType) {
case AArch64::DestructiveUnaryPassthru:
- DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(PredIdx))
.add(MI.getOperand(SrcIdx));
break;
@@ -659,12 +665,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
- .add(MI.getOperand(SrcIdx));
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
+ .add(MI.getOperand(SrcIdx));
break;
case AArch64::DestructiveTernaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(SrcIdx))
.add(MI.getOperand(Src2Idx));
break;
@@ -1199,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
if (DstReg == MI.getOperand(3).getReg()) {
// Expand to BIT
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
- : AArch64::BITv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(3))
- .add(MI.getOperand(2))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else if (DstReg == MI.getOperand(2).getReg()) {
// Expand to BIF
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
- : AArch64::BIFv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else {
// Expand to BSL, use additional move if required
if (DstReg == MI.getOperand(1).getReg()) {
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I, I);
} else {
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
@@ -1234,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
getRenamableRegState(MI.getOperand(0).isRenamable()))
.add(MI.getOperand(1))
.add(MI.getOperand(1));
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .addReg(DstReg,
- RegState::Kill |
- getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill | getRenamableRegState(
+ MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I2, I2);
}
}
MI.eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4f13a14..f026726 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6439,7 +6439,9 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
}
}
- return true;
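+  // Only report the extension as desirable when the pre-extended element type
+  // is a standard i8/i16/i32/i64 integer type.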
+ EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
+ return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
+ PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -17155,7 +17157,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17163,6 +17165,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17486,9 +17493,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleavedValues) const {
- unsigned Factor = DeinterleavedValues.size();
+ Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
@@ -17498,9 +17504,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
assert(!Mask && "Unexpected mask on a load\n");
- Value *FirstActive = *llvm::find_if(DeinterleavedValues,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ VectorType *VTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17528,6 +17532,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
+ Value *Result = nullptr;
if (NumLoads > 1) {
// Create multiple legal small ldN.
SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
@@ -17548,35 +17553,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned J = 0; J < Factor; ++J) {
- if (DeinterleavedValues[J])
- DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
- }
+
+ // Merge the values from different factors.
+ Result = PoisonValue::get(DI->getType());
+ for (unsigned J = 0; J < Factor; ++J)
+ Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
} else {
- Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned I = 0; I < Factor; I++) {
- if (DeinterleavedValues[I]) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- }
}
+
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ DI->replaceAllUsesWith(Result);
return true;
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
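+  // Only plain (unmasked) stores are handled here.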
+ StoreInst *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!Mask && "Unexpected mask on plain store");
VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
const DataLayout &DL = SI->getModule()->getDataLayout();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6afb3c3..713793e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,19 +211,19 @@ public:
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveValues) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 996b0ed..bc57537 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+static bool getGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode, unsigned NumLanes) {
+ const MachineFunction *MF = Root.getMF();
+
+ // Early exit if optimizing for size.
+ if (MF->getFunction().hasMinSize())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
+ return false;
+
+  // Check that we have loads into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the virtual
+ // register)
+ // 2. That the addressing mode only uses a single offset register.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
+ while (!RemainingLanes.empty() && CurrInstr &&
+ CurrInstr->getOpcode() == LoadLaneOpCode &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+ CurrInstr->getNumOperands() == 4) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the subreg to reg loads an integer into the first lane.
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+ return false;
+
+  // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+ return false;
+
+ switch (NumLanes) {
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
+ return true;
+}
+
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128-bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
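+///
+/// For example, for four i32 lanes the rewrite is roughly (registers and
+/// address operands are illustrative):
+///
+///   ldr s0, [x0]              // lane 0
+///   ld1 { v0.s }[1], [x1]     // lane 1
+///   ld1 { v0.s }[2], [x2]     // lane 2
+///   ld1 { v0.s }[3], [x3]     // lane 3
+///
+/// becomes
+///
+///   ldr  s0, [x0]             // lanes 0-1 accumulate in v0
+///   ld1  { v0.s }[1], [x1]
+///   ldr  s1, [x2]             // lanes 2-3 accumulate in v1
+///   ld1  { v1.s }[1], [x3]
+///   zip1 v0.2d, v0.2d, v1.2d  // combine the two halves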
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+
+ // The pattern searches for loads into single lanes.
+ switch (Root.getOpcode()) {
+ case AArch64::LD1i32:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
+ }
+}
+
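+/// Generate the pattern found by getLoadPatterns: load the first half of the
+/// lanes into the original accumulator, load the second half into a fresh
+/// register (starting with a scalar load that zeroes its upper lanes), and
+/// combine the two halves with ZIP1 on the 64-bit elements.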
+static void
+generateGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+ unsigned Pattern, unsigned NumLanes) {
+
+ MachineFunction &MF = *Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ // Gather the initial load instructions to build the pattern
+ SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+ MachineInstr *CurrInstr = &Root;
+ for (unsigned i = 0; i < NumLanes - 1; ++i) {
+ LoadToLaneInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs,
+ [](const MachineInstr *A, const MachineInstr *B) {
+ return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+ });
+
+ MachineInstr *SubregToReg = CurrInstr;
+ LoadToLaneInstrs.push_back(
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+ auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper to create load instruction based on opcode
+ auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable(
+ "Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0); // immediate offset
+ };
+
+  // Load lanes 1 .. NumLanes/2-1 into register 0; lane 0 is already there
+  // from the original scalar load.
+ auto LanesToLoadToReg0 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+ auto PrevReg = SubregToReg->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg0 = PrevReg;
+
+  // First load into register 1. Use a scalar LDR (LDRSui/LDRHui/LDRBui,
+  // depending on the lane size) to zero out the upper lanes in a single
+  // instruction.
+ auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+ auto OriginalSplitLoad =
+ *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ MachineInstrBuilder MiddleIndexLoadInstr =
+ CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+ OriginalSplitLoad->getOperand(3).getReg());
+
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+ InsInstrs.push_back(MiddleIndexLoadInstr);
+ DelInstrs.push_back(OriginalSplitLoad);
+
+ // Subreg To Reg instruction for register 1.
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ unsigned SubregType;
+ switch (NumLanes) {
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable(
+ "Got invalid NumLanes for machine-combiner gather pattern");
+ }
+
+ auto SubRegToRegInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForMiddleIndex, getKillRegState(true))
+ .addImm(SubregType);
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load remaining lanes into register 1.
+ auto LanesToLoadToReg1 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+ LoadToLaneInstrsAscending.end());
+ PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
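+    // Do not queue the last lane load (the root of the pattern) for deletion
+    // here; the root is recorded for deletion by the caller after the switch
+    // in genAlternativeCodeSequence.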
+ if (Index == NumLanes / 2 - 2) {
+ break;
+ }
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg1 = PrevReg;
+
+ // Create the final zip instruction to combine the results.
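+  // ZIP1 on the 64-bit elements places the low half of each accumulator side
+  // by side, reassembling the full vector in the original destination.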
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(LastLoadReg0)
+ .addReg(LastLoadReg1);
+ InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da..02734866 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c46b18..9f8a257 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1053,13 +1053,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>;
def AArch64saddlv : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>;
-def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abdu node:$lhs, node:$rhs),
- (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
-def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abds node:$lhs, node:$rhs),
- (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
-
// Add Pairwise of two vectors
def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>;
// Add Long Pairwise
@@ -5667,8 +5660,7 @@ let Predicates = [HasFullFP16] in {
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- AArch64uabd>;
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>;
// Match UABDL in log2-shuffle patterns.
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))))),
@@ -6018,8 +6010,8 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
- TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
-defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
+ TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", abds>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
@@ -6037,8 +6029,8 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
- TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
-defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
+ TriOpFrag<(add node:$LHS, (abdu node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", abdu>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
@@ -6759,10 +6751,8 @@ defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
-defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
- AArch64sabd>;
-defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
- AArch64sabd>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
@@ -6780,8 +6770,7 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
- AArch64uabd>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 0ddd17c..abcd550 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI);
+ bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(
- unsigned Opc, MachineInstr &MI) {
+bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
+ unsigned OtherOpc) {
// Try below transformation.
//
- // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
- // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
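+  //
+  // For example (an illustrative constant): 0x200400 is not a valid logical
+  // immediate, but it is the AND of two immediates that are, so
+  //   x = y & 0x200400
+  // can be emitted as
+  //   x = (y & 0x3ffc00) & 0xffe007ff
+  // with the second AND being the flag-setting ANDS form.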
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
@@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND(
return splitTwoPartImm<T>(
MI,
- [Opc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
+ [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
- return std::make_pair(Opc, Opc);
+ return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
@@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::ANDXrr:
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
break;
+ case AArch64::ANDSWrr:
+ Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ break;
+ case AArch64::ANDSXrr:
+ Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2409cc8..0f4f012 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 154db3c..061ed61 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -343,7 +343,8 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
SDep &Dep,
const TargetSchedModel *SchedModel) const override;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 473ba5e..bb0f667b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -287,6 +287,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.moreElementsToNextPow2(0)
.lower();
+ getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+ .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .lower();
+
getActionDefinitionsBuilder(
{G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
.legalFor({{s32, s32}, {s64, s32}})
@@ -1794,6 +1798,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
return LowerBinOp(AArch64::G_UMULL);
+ case Intrinsic::aarch64_neon_sabd:
+ return LowerBinOp(TargetOpcode::G_ABDS);
+ case Intrinsic::aarch64_neon_uabd:
+ return LowerBinOp(TargetOpcode::G_ABDU);
case Intrinsic::aarch64_neon_abs: {
// Lower the intrinsic to G_ABS.
MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 233f42b..08f547a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -559,8 +559,7 @@ void AArch64TargetELFStreamer::finish() {
if (!Sym.isMemtag())
continue;
auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx);
- (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(),
- *Ctx.getSubtargetInfo());
+ S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE);
}
}