Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp')
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 351
1 file changed, 252 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5565a..043be55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,8 +9,8 @@
#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
+#include "AArch64SMEAttributes.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
-#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));
+static cl::opt<int> Aarch64ForceUnrollThreshold(
+ "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -248,12 +252,23 @@ static bool hasPossibleIncompatibleOps(const Function *F,
return false;
}
-APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
+static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
+ SmallVectorImpl<StringRef> &Features) {
StringRef AttributeStr =
- isMultiversionedFunction(F) ? "fmv-features" : "target-features";
+ TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
- SmallVector<StringRef, 8> Features;
FeatureStr.split(Features, ",");
+}
+
+APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
+ SmallVector<StringRef, 8> Features;
+ extractAttrFeatures(F, this, Features);
+ return AArch64::getCpuSupportsMask(Features);
+}
+
+APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
+ SmallVector<StringRef, 8> Features;
+ extractAttrFeatures(F, this, Features);
return AArch64::getFMVPriority(Features);
}
@@ -371,8 +386,13 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
- return (K == TargetTransformInfo::RGK_FixedWidthVector &&
- ST->isNeonAvailable());
+
+ if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
+ return true;
+
+ return K == TargetTransformInfo::RGK_ScalableVector &&
+ ST->isSVEorStreamingSVEAvailable() &&
+ !ST->disableMaximizeScalableBandwidth();
}
/// Calculate the cost of materializing a 64-bit value. This helper
@@ -917,8 +937,20 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.getArgs().empty())
break;
- // TODO: Add handling for fshl where third argument is not a constant.
const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
+
+ // ROTR / ROTL is a funnel shift with equal first and second operand. For
+ // ROTR on integer registers (i32/i64) this can be done in a single ror
+ // instruction. A fshl with a non-constant shift uses a neg + ror.
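+ // e.g. fshl(i32 %x, i32 %x, i32 11) lowers to ror w0, w0, #21 (cost 1),
+ // while fshl(i32 %x, i32 %x, i32 %y) needs a neg plus a ror (cost 2).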
+ if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
+ (RetTy->getPrimitiveSizeInBits() == 32 ||
+ RetTy->getPrimitiveSizeInBits() == 64)) {
+ InstructionCost NegCost =
+ (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
+ return 1 + NegCost;
+ }
+
+ // TODO: Add handling for fshl where third argument is not a constant.
if (!OpInfoZ.isConstant())
break;
@@ -1425,10 +1457,22 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_orr:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
.setMatchingIROpcode(Instruction::Or);
+ case Intrinsic::aarch64_sve_sqrshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
+ case Intrinsic::aarch64_sve_sqshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
case Intrinsic::aarch64_sve_sqsub:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
+ case Intrinsic::aarch64_sve_srshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
+ case Intrinsic::aarch64_sve_uqrshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
+ case Intrinsic::aarch64_sve_uqshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
case Intrinsic::aarch64_sve_uqsub:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
+ case Intrinsic::aarch64_sve_urshl:
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
case Intrinsic::aarch64_sve_add_u:
return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
@@ -1870,25 +1914,23 @@ static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
IntrinsicInst &II) {
- IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
- if (!Pg)
- return std::nullopt;
+ Value *Pg = II.getOperand(1);
- if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
- return std::nullopt;
+ // sve.dup(V, all_active, X) ==> splat(X)
+ if (isAllActivePredicate(Pg)) {
+ auto *RetTy = cast<ScalableVectorType>(II.getType());
+ Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
+ II.getArgOperand(2));
+ return IC.replaceInstUsesWith(II, Splat);
+ }
- const auto PTruePattern =
- cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
- if (PTruePattern != AArch64SVEPredPattern::vl1)
+ if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+ m_SpecificInt(AArch64SVEPredPattern::vl1))))
return std::nullopt;
- // The intrinsic is inserting into lane zero so use an insert instead.
- auto *IdxTy = Type::getInt64Ty(II.getContext());
- auto *Insert = InsertElementInst::Create(
- II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
- Insert->insertBefore(II.getIterator());
- Insert->takeName(&II);
-
+ // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
+ Value *Insert = IC.Builder.CreateInsertElement(
+ II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
return IC.replaceInstUsesWith(II, Insert);
}
@@ -3007,9 +3049,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
llvm_unreachable("Unsupported register kind");
}
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
- ArrayRef<const Value *> Args,
- Type *SrcOverrideTy) const {
+bool AArch64TTIImpl::isSingleExtWideningInstruction(
+ unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy) const {
// A helper that returns a vector type from the given type. The number of
// elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
@@ -3027,48 +3069,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
(DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
return false;
- // Determine if the operation has a widening variant. We consider both the
- // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
- // instructions.
- //
- // TODO: Add additional widening operations (e.g., shl, etc.) once we
- // verify that their extending operands are eliminated during code
- // generation.
Type *SrcTy = SrcOverrideTy;
switch (Opcode) {
- case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
- case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ case Instruction::Add: // UADDW(2), SADDW(2).
+ case Instruction::Sub: { // USUBW(2), SSUBW(2).
// The second operand needs to be an extend
if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
- } else
+ break;
+ }
+
+ if (Opcode == Instruction::Sub)
return false;
- break;
- case Instruction::Mul: { // SMULL(2), UMULL(2)
- // Both operands need to be extends of the same type.
- if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
- (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+
+ // UADDW(2), SADDW(2) can be commuted.
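+ // e.g. add(zext <8 x i8> %a to <8 x i16>, <8 x i16> %b) is still treated
+ // as a single uaddw, with the operands swapped.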
+ if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
- } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
- // If one of the operands is a Zext and the other has enough zero bits to
- // be treated as unsigned, we can still general a umull, meaning the zext
- // is free.
- KnownBits Known =
- computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
- if (Args[0]->getType()->getScalarSizeInBits() -
- Known.Zero.countLeadingOnes() >
- DstTy->getScalarSizeInBits() / 2)
- return false;
- if (!SrcTy)
- SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
- DstTy->getScalarSizeInBits() / 2));
- } else
- return false;
- break;
+ break;
+ }
+ return false;
}
default:
return false;
@@ -3099,6 +3122,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
+Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
+ ArrayRef<const Value *> Args,
+ Type *SrcOverrideTy) const {
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
+ Opcode != Instruction::Mul)
+ return nullptr;
+
+ // Exit early if DstTy is not a vector type whose elements are one of [i16,
+ // i32, i64]. SVE doesn't generally have the same set of instructions to
+ // perform an extend with the add/sub/mul. There are SMULLB style
+ // instructions, but they operate on top/bottom, requiring some sort of lane
+ // interleaving to be used with zext/sext.
+ unsigned DstEltSize = DstTy->getScalarSizeInBits();
+ if (!useNeonVector(DstTy) || Args.size() != 2 ||
+ (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
+ return nullptr;
+
+ auto getScalarSizeWithOverride = [&](const Value *V) {
+ if (SrcOverrideTy)
+ return SrcOverrideTy->getScalarSizeInBits();
+ return cast<Instruction>(V)
+ ->getOperand(0)
+ ->getType()
+ ->getScalarSizeInBits();
+ };
+
+ unsigned MaxEltSize = 0;
+ if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
+ (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+ unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+ unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+ MaxEltSize = std::max(EltSize0, EltSize1);
+ } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
+ isa<SExtInst, ZExtInst>(Args[1])) {
+ unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+ unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+ // mul(sext, zext) will become smull(sext, zext) if the extends are large
+ // enough.
+ if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
+ return nullptr;
+ MaxEltSize = DstEltSize / 2;
+ } else if (Opcode == Instruction::Mul &&
+ (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
+ // If one of the operands is a Zext and the other has enough zero bits
+ // to be treated as unsigned, we can still generate a umull, meaning the
+ // zext is free.
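+ // e.g. mul(zext <8 x i8> %a to <8 x i16>, and (<8 x i16> %b, splat (255)))
+ // can still use umull because the top bits of %b are known to be zero.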
+ KnownBits Known =
+ computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
+ if (Args[0]->getType()->getScalarSizeInBits() -
+ Known.Zero.countLeadingOnes() >
+ DstTy->getScalarSizeInBits() / 2)
+ return nullptr;
+
+ MaxEltSize =
+ getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
+ } else
+ return nullptr;
+
+ if (MaxEltSize * 2 > DstEltSize)
+ return nullptr;
+
+ Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
+ if (ExtTy->getPrimitiveSizeInBits() <= 64)
+ return nullptr;
+ return ExtTy;
+}
+
// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
@@ -3159,7 +3249,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
- if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
+ if (Type *ExtTy = isBinExtWideningInstruction(
+ SingleUser->getOpcode(), Dst, Operands,
+ Src != I->getOperand(0)->getType() ? Src : nullptr)) {
+ // The cost from Src->Src*2 needs to be added if required; the cost from
+ // Src*2->ExtTy is free.
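+ // e.g. for add(zext <2 x i8> %a to <2 x i64>, zext <2 x i32> %b to <2 x i64>),
+ // the <2 x i8> extend is charged as <2 x i8> -> <2 x i16> and the rest of
+ // the extending is considered free.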
+ if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
+ Type *DoubleSrcTy =
+ Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
+ return getCastInstrCost(Opcode, DoubleSrcTy, Src,
+ TTI::CastContextHint::None, CostKind);
+ }
+
+ return 0;
+ }
+
+ if (isSingleExtWideningInstruction(
+ SingleUser->getOpcode(), Dst, Operands,
+ Src != I->getOperand(0)->getType() ? Src : nullptr)) {
// For adds only count the second operand as free if both operands are
// extends but not the same operation. (i.e both operands are not free in
// add(sext, zext)).
@@ -3168,8 +3275,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(isa<CastInst>(SingleUser->getOperand(1)) &&
cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
return 0;
- } else // Others are free so long as isWideningInstruction returned true.
+ } else {
+ // Others are free so long as isSingleExtWideningInstruction
+ // returned true.
return 0;
+ }
}
// The cast will be free for the s/urhadd instructions
@@ -4095,12 +4205,15 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
- TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
std::function<InstructionCost(Type *)> InstCost) const {
if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
return std::nullopt;
if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
return std::nullopt;
+ if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
+ ST->isNonStreamingSVEorSME2Available())
+ return std::nullopt;
Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
@@ -4142,12 +4255,26 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
ISD == ISD::FDIV || ISD == ISD::FREM)
if (auto PromotedCost = getFP16BF16PromoteCost(
Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+ // There is no native support for fdiv/frem even with +sve-b16b16.
+ /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
[&](Type *PromotedTy) {
return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
Op1Info, Op2Info);
}))
return *PromotedCost;
+ // If the operation is a widening instruction (e.g. smull or umull) and both
+ // operands are extends, the cost can be cheaper: the operation is modeled as
+ // operating on the narrowest type possible (double the largest input size)
+ // followed by a further extend.
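+ // e.g. add(zext <8 x i8> %a to <8 x i32>, zext <8 x i8> %b to <8 x i32>) is
+ // costed as an <8 x i16> add (uaddl) plus an <8 x i16> -> <8 x i32> extend.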
+ if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
+ if (ExtTy != Ty)
+ return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
+ getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
+ TTI::CastContextHint::None, CostKind);
+ return LT.first;
+ }
+
switch (ISD) {
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4381,10 +4508,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// - two 2-cost i64 inserts, and
// - two 1-cost muls.
// So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
- // LT.first = 2 the cost is 28. If both operands are extensions it will not
- // need to scalarize so the cost can be cheaper (smull or umull).
- // so the cost can be cheaper (smull or umull).
- if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+ // LT.first = 2 the cost is 28.
+ if (LT.second != MVT::v2i64)
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
@@ -4546,7 +4671,8 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
if (Opcode == Instruction::FCmp) {
if (auto PromotedCost = getFP16BF16PromoteCost(
ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
- [&](Type *PromotedTy) {
+ // TODO: Consider costing SVE FCMPs.
+ /*CanUseSVE=*/false, [&](Type *PromotedTy) {
InstructionCost Cost =
getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
CostKind, Op1Info, Op2Info);
@@ -4642,12 +4768,26 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const {
}
InstructionCost
-AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment, unsigned AddressSpace,
+AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const {
+ switch (MICA.getID()) {
+ case Intrinsic::masked_scatter:
+ case Intrinsic::masked_gather:
+ return getGatherScatterOpCost(MICA, CostKind);
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store:
+ return getMaskedMemoryOpCost(MICA, CostKind);
+ }
+ return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
+}
+
+InstructionCost
+AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
TTI::TargetCostKind CostKind) const {
+ Type *Src = MICA.getDataType();
+
if (useNeonVector(Src))
- return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
- CostKind);
+ return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
auto LT = getTypeLegalizationCost(Src);
if (!LT.first.isValid())
return InstructionCost::getInvalid();
@@ -4689,12 +4829,21 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
}
}
-InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
- unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
- Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
+InstructionCost
+AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const {
+
+ unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
+ MICA.getID() == Intrinsic::vp_gather)
+ ? Instruction::Load
+ : Instruction::Store;
+
+ Type *DataTy = MICA.getDataType();
+ Align Alignment = MICA.getAlignment();
+ const Instruction *I = MICA.getInst();
+
if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
- return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
- Alignment, CostKind, I);
+ return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
auto *VT = cast<VectorType>(DataTy);
auto LT = getTypeLegalizationCost(DataTy);
if (!LT.first.isValid())
@@ -5172,6 +5321,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
// inlining. Don't unroll auto-vectorized loops either, though do allow
// unrolling of the scalar remainder.
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+ InstructionCost Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Both auto-vectorized loops and the scalar remainder have the
@@ -5186,24 +5336,19 @@ void AArch64TTIImpl::getUnrollingPreferences(
continue;
return;
}
+
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ Cost += getInstructionCost(&I, Operands,
+ TargetTransformInfo::TCK_SizeAndLatency);
}
}
// Apply subtarget-specific unrolling preferences.
- switch (ST->getProcFamily()) {
- case AArch64Subtarget::AppleA14:
- case AArch64Subtarget::AppleA15:
- case AArch64Subtarget::AppleA16:
- case AArch64Subtarget::AppleM4:
+ if (ST->isAppleMLike())
getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
- break;
- case AArch64Subtarget::Falkor:
- if (EnableFalkorHWPFUnrollFix)
- getFalkorUnrollingPreferences(L, SE, UP);
- break;
- default:
- break;
- }
+ else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+ EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
// If this is a small, multi-exit loop similar to something like std::find,
// then there is typically a performance improvement achieved by unrolling.
@@ -5232,6 +5377,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
}
+
+ // Forcing the unrolling of small loops can be very beneficial because of the
+ // branch-taken cost of the backedge.
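+ // e.g. -mllvm -aarch64-force-unroll-threshold=20 sets UP.Force for loops
+ // whose size-and-latency cost is below 20.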
+ if (Cost < Aarch64ForceUnrollThreshold)
+ UP.Force = true;
}
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
@@ -5902,6 +6052,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
SrcTy = DstTy;
}
+ // Check for identity masks, which we can treat as free for both fixed and
+ // scalable vector paths.
+ if (!Mask.empty() && LT.second.isFixedLengthVector() &&
+ (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
+ all_of(enumerate(Mask), [](const auto &M) {
+ return M.value() < 0 || M.value() == (int)M.index();
+ }))
+ return 0;
+
// Segmented shuffle matching.
if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
!Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
@@ -5949,21 +6108,13 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
all_of(Mask, [](int E) { return E < 8; }))
return getPerfectShuffleCost(Mask);
- // Check for identity masks, which we can treat as free.
- if (!Mask.empty() && LT.second.isFixedLengthVector() &&
- (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
- all_of(enumerate(Mask), [](const auto &M) {
- return M.value() < 0 || M.value() == (int)M.index();
- }))
- return 0;
-
// Check for other shuffles that are not SK_ kinds but we have native
// instructions for, for example ZIP and UZP.
unsigned Unused;
if (LT.second.isFixedLengthVector() &&
LT.second.getVectorNumElements() == Mask.size() &&
(Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
- (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
+ (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
isREVMask(Mask, LT.second.getScalarSizeInBits(),
LT.second.getVectorNumElements(), 16) ||
@@ -6129,7 +6280,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
}
static bool containsDecreasingPointers(Loop *TheLoop,
- PredicatedScalarEvolution *PSE) {
+ PredicatedScalarEvolution *PSE,
+ const DominatorTree &DT) {
const auto &Strides = DenseMap<Value *, const SCEV *>();
for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the instructions in the block and look for addresses that are
@@ -6138,8 +6290,8 @@ static bool containsDecreasingPointers(Loop *TheLoop,
if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
Value *Ptr = getLoadStorePointerOperand(&I);
Type *AccessTy = getLoadStoreType(&I);
- if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
- /*ShouldCheckWrap=*/false)
+ if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false)
.value_or(0) < 0)
return true;
}
@@ -6184,7 +6336,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
// negative strides. This will require extra work to reverse the loop
// predicate, which may be expensive.
if (containsDecreasingPointers(TFI->LVL->getLoop(),
- TFI->LVL->getPredicatedScalarEvolution()))
+ TFI->LVL->getPredicatedScalarEvolution(),
+ *TFI->LVL->getDominatorTree()))
Required |= TailFoldingOpts::Reverse;
if (Required == TailFoldingOpts::Disabled)
Required |= TailFoldingOpts::Simple;