Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp')
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 351
1 file changed, 252 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5565a..043be55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,8 +9,8 @@
 #include "AArch64TargetTransformInfo.h"
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
+#include "AArch64SMEAttributes.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
-#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
     "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
     cl::desc("The number of instructions to search for a redundant dmb"));
 
+static cl::opt<int> Aarch64ForceUnrollThreshold(
+    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -248,12 +252,23 @@ static bool hasPossibleIncompatibleOps(const Function *F,
   return false;
 }
 
-APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
+static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
+                                SmallVectorImpl<StringRef> &Features) {
   StringRef AttributeStr =
-      isMultiversionedFunction(F) ? "fmv-features" : "target-features";
+      TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
   StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
-  SmallVector<StringRef, 8> Features;
   FeatureStr.split(Features, ",");
+}
+
+APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
+  SmallVector<StringRef, 8> Features;
+  extractAttrFeatures(F, this, Features);
+  return AArch64::getCpuSupportsMask(Features);
+}
+
+APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
+  SmallVector<StringRef, 8> Features;
+  extractAttrFeatures(F, this, Features);
   return AArch64::getFMVPriority(Features);
 }
 
@@ -371,8 +386,13 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   assert(K != TargetTransformInfo::RGK_Scalar);
-  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
-          ST->isNeonAvailable());
+
+  if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
+    return true;
+
+  return K == TargetTransformInfo::RGK_ScalableVector &&
+         ST->isSVEorStreamingSVEAvailable() &&
+         !ST->disableMaximizeScalableBandwidth();
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
@@ -917,8 +937,20 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (ICA.getArgs().empty())
      break;
 
-    // TODO: Add handling for fshl where third argument is not a constant.
     const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
+
+    // ROTR / ROTL is a funnel shift with equal first and second operand. For
+    // ROTR on integer registers (i32/i64) this can be done in a single ror
+    // instruction. A fshl with a non-constant shift uses a neg + ror.
+    if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
+        (RetTy->getPrimitiveSizeInBits() == 32 ||
+         RetTy->getPrimitiveSizeInBits() == 64)) {
+      InstructionCost NegCost =
+          (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
+      return 1 + NegCost;
+    }
+
+    // TODO: Add handling for fshl where third argument is not a constant.
     if (!OpInfoZ.isConstant())
       break;
 
@@ -1425,10 +1457,22 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
   case Intrinsic::aarch64_sve_orr:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
         .setMatchingIROpcode(Instruction::Or);
+  case Intrinsic::aarch64_sve_sqrshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
+  case Intrinsic::aarch64_sve_sqshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
   case Intrinsic::aarch64_sve_sqsub:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
+  case Intrinsic::aarch64_sve_srshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
+  case Intrinsic::aarch64_sve_uqrshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
+  case Intrinsic::aarch64_sve_uqshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
   case Intrinsic::aarch64_sve_uqsub:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
+  case Intrinsic::aarch64_sve_urshl:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
   case Intrinsic::aarch64_sve_add_u:
     return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
@@ -1870,25 +1914,23 @@ static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
 
 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                       IntrinsicInst &II) {
-  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
-  if (!Pg)
-    return std::nullopt;
+  Value *Pg = II.getOperand(1);
 
-  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
-    return std::nullopt;
+  // sve.dup(V, all_active, X) ==> splat(X)
+  if (isAllActivePredicate(Pg)) {
+    auto *RetTy = cast<ScalableVectorType>(II.getType());
+    Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
+                                                II.getArgOperand(2));
+    return IC.replaceInstUsesWith(II, Splat);
+  }
 
-  const auto PTruePattern =
-      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
-  if (PTruePattern != AArch64SVEPredPattern::vl1)
+  if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+                     m_SpecificInt(AArch64SVEPredPattern::vl1))))
     return std::nullopt;
 
-  // The intrinsic is inserting into lane zero so use an insert instead.
-  auto *IdxTy = Type::getInt64Ty(II.getContext());
-  auto *Insert = InsertElementInst::Create(
-      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
-  Insert->insertBefore(II.getIterator());
-  Insert->takeName(&II);
-
+  // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
+  Value *Insert = IC.Builder.CreateInsertElement(
+      II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
   return IC.replaceInstUsesWith(II, Insert);
 }
 
@@ -3007,9 +3049,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   llvm_unreachable("Unsupported register kind");
 }
 
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
-                                           ArrayRef<const Value *> Args,
-                                           Type *SrcOverrideTy) const {
+bool AArch64TTIImpl::isSingleExtWideningInstruction(
+    unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
+    Type *SrcOverrideTy) const {
   // A helper that returns a vector type from the given type. The number of
   // elements in type Ty determines the vector width.
   auto toVectorTy = [&](Type *ArgTy) {
@@ -3027,48 +3069,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
     return false;
 
-  // Determine if the operation has a widening variant. We consider both the
-  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
-  // instructions.
-  //
-  // TODO: Add additional widening operations (e.g., shl, etc.) once we
-  // verify that their extending operands are eliminated during code
-  // generation.
   Type *SrcTy = SrcOverrideTy;
   switch (Opcode) {
-  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
-  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+  case Instruction::Add:   // UADDW(2), SADDW(2).
+  case Instruction::Sub: { // USUBW(2), SSUBW(2).
     // The second operand needs to be an extend
     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
       if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
-    } else
+      break;
+    }
+
+    if (Opcode == Instruction::Sub)
       return false;
-    break;
-  case Instruction::Mul: { // SMULL(2), UMULL(2)
-    // Both operands need to be extends of the same type.
-    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
-        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+
+    // UADDW(2), SADDW(2) can be commutted.
+    if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
       if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
-    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
-      // If one of the operands is a Zext and the other has enough zero bits to
-      // be treated as unsigned, we can still general a umull, meaning the zext
-      // is free.
-      KnownBits Known =
-          computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
-      if (Args[0]->getType()->getScalarSizeInBits() -
-              Known.Zero.countLeadingOnes() >
-          DstTy->getScalarSizeInBits() / 2)
-        return false;
-      if (!SrcTy)
-        SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
-                                           DstTy->getScalarSizeInBits() / 2));
-    } else
-      return false;
-    break;
+      break;
+    }
+    return false;
   }
   default:
     return false;
  }
@@ -3099,6 +3122,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
+                                                  ArrayRef<const Value *> Args,
+                                                  Type *SrcOverrideTy) const {
+  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
+      Opcode != Instruction::Mul)
+    return nullptr;
+
+  // Exit early if DstTy is not a vector type whose elements are one of [i16,
+  // i32, i64]. SVE doesn't generally have the same set of instructions to
+  // perform an extend with the add/sub/mul. There are SMULLB style
+  // instructions, but they operate on top/bottom, requiring some sort of lane
+  // interleaving to be used with zext/sext.
+  unsigned DstEltSize = DstTy->getScalarSizeInBits();
+  if (!useNeonVector(DstTy) || Args.size() != 2 ||
+      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
+    return nullptr;
+
+  auto getScalarSizeWithOverride = [&](const Value *V) {
+    if (SrcOverrideTy)
+      return SrcOverrideTy->getScalarSizeInBits();
+    return cast<Instruction>(V)
+        ->getOperand(0)
+        ->getType()
+        ->getScalarSizeInBits();
+  };
+
+  unsigned MaxEltSize = 0;
+  if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
+      (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+    MaxEltSize = std::max(EltSize0, EltSize1);
+  } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
+             isa<SExtInst, ZExtInst>(Args[1])) {
+    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
+    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
+    // mul(sext, zext) will become smull(sext, zext) if the extends are large
+    // enough.
+    if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
+      return nullptr;
+    MaxEltSize = DstEltSize / 2;
+  } else if (Opcode == Instruction::Mul &&
+             (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
+    // If one of the operands is a Zext and the other has enough zero bits
+    // to be treated as unsigned, we can still generate a umull, meaning the
+    // zext is free.
+    KnownBits Known =
+        computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
+    if (Args[0]->getType()->getScalarSizeInBits() -
+            Known.Zero.countLeadingOnes() >
+        DstTy->getScalarSizeInBits() / 2)
+      return nullptr;
+
+    MaxEltSize =
+        getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
+  } else
+    return nullptr;
+
+  if (MaxEltSize * 2 > DstEltSize)
+    return nullptr;
+
+  Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
+  if (ExtTy->getPrimitiveSizeInBits() <= 64)
+    return nullptr;
+  return ExtTy;
+}
+
 // s/urhadd instructions implement the following pattern, making the
 // extends free:
 //   %x = add ((zext i8 -> i16), 1)
@@ -3159,7 +3249,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (I && I->hasOneUser()) {
     auto *SingleUser = cast<Instruction>(*I->user_begin());
     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
-    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
+    if (Type *ExtTy = isBinExtWideningInstruction(
+            SingleUser->getOpcode(), Dst, Operands,
+            Src != I->getOperand(0)->getType() ? Src : nullptr)) {
+      // The cost from Src->Src*2 needs to be added if required, the cost from
+      // Src*2->ExtTy is free.
+      if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
+        Type *DoubleSrcTy =
+            Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
+        return getCastInstrCost(Opcode, DoubleSrcTy, Src,
+                                TTI::CastContextHint::None, CostKind);
+      }
+
+      return 0;
+    }
+
+    if (isSingleExtWideningInstruction(
+            SingleUser->getOpcode(), Dst, Operands,
+            Src != I->getOperand(0)->getType() ? Src : nullptr)) {
       // For adds only count the second operand as free if both operands are
       // extends but not the same operation. (i.e both operands are not free in
       // add(sext, zext)).
@@ -3168,8 +3275,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
           (isa<CastInst>(SingleUser->getOperand(1)) &&
            cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
         return 0;
-    } else // Others are free so long as isWideningInstruction returned true.
+    } else {
+      // Others are free so long as isSingleExtWideningInstruction
+      // returned true.
       return 0;
+    }
   }
 
   // The cast will be free for the s/urhadd instructions
@@ -4095,12 +4205,15 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
 
 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
-    TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+    TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
     std::function<InstructionCost(Type *)> InstCost) const {
   if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
     return std::nullopt;
   if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
     return std::nullopt;
+  if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
+      ST->isNonStreamingSVEorSME2Available())
+    return std::nullopt;
 
   Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
   InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
@@ -4142,12 +4255,26 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
       ISD == ISD::FDIV || ISD == ISD::FREM)
     if (auto PromotedCost = getFP16BF16PromoteCost(
             Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+            // There is not native support for fdiv/frem even with +sve-b16b16.
+            /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
            [&](Type *PromotedTy) {
              return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
                                            Op1Info, Op2Info);
            }))
      return *PromotedCost;
 
+  // If the operation is a widening instruction (smull or umull) and both
+  // operands are extends the cost can be cheaper by considering that the
+  // operation will operate on the narrowest type size possible (double the
+  // largest input size) and a further extend.
+  if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
+    if (ExtTy != Ty)
+      return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
+             getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
+                              TTI::CastContextHint::None, CostKind);
+    return LT.first;
+  }
+
   switch (ISD) {
   default:
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4381,10 +4508,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     // - two 2-cost i64 inserts, and
     // - two 1-cost muls.
     // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
-    // LT.first = 2 the cost is 28. If both operands are extensions it will not
-    // need to scalarize so the cost can be cheaper (smull or umull).
-    // so the cost can be cheaper (smull or umull).
-    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+    // LT.first = 2 the cost is 28.
+    if (LT.second != MVT::v2i64)
       return LT.first;
     return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
            (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
@@ -4546,7 +4671,8 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
   if (Opcode == Instruction::FCmp) {
     if (auto PromotedCost = getFP16BF16PromoteCost(
             ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
-            [&](Type *PromotedTy) {
+            // TODO: Consider costing SVE FCMPs.
+            /*CanUseSVE=*/false, [&](Type *PromotedTy) {
              InstructionCost Cost =
                  getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
                                     CostKind, Op1Info, Op2Info);
@@ -4642,12 +4768,26 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const {
 }
 
 InstructionCost
-AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
-                                      Align Alignment, unsigned AddressSpace,
+AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
+                                         TTI::TargetCostKind CostKind) const {
+  switch (MICA.getID()) {
+  case Intrinsic::masked_scatter:
+  case Intrinsic::masked_gather:
+    return getGatherScatterOpCost(MICA, CostKind);
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_store:
+    return getMaskedMemoryOpCost(MICA, CostKind);
+  }
+  return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
+}
+
+InstructionCost
+AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
                                       TTI::TargetCostKind CostKind) const {
+  Type *Src = MICA.getDataType();
+
   if (useNeonVector(Src))
-    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                        CostKind);
+    return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
   auto LT = getTypeLegalizationCost(Src);
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
@@ -4689,12 +4829,21 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
   }
 }
 
-InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
-    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
-    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
+InstructionCost
+AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
+                                       TTI::TargetCostKind CostKind) const {
+
+  unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
+                     MICA.getID() == Intrinsic::vp_gather)
+                        ? Instruction::Load
+                        : Instruction::Store;
+
+  Type *DataTy = MICA.getDataType();
+  Align Alignment = MICA.getAlignment();
+  const Instruction *I = MICA.getInst();
+
   if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
-    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                         Alignment, CostKind, I);
+    return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
   auto *VT = cast<VectorType>(DataTy);
   auto LT = getTypeLegalizationCost(DataTy);
   if (!LT.first.isValid())
@@ -5172,6 +5321,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // inlining. Don't unroll auto-vectorized loops either, though do allow
   // unrolling of the scalar remainder.
   bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+  InstructionCost Cost = 0;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
       // Both auto-vectorized loops and the scalar remainder have the
@@ -5186,24 +5336,19 @@ void AArch64TTIImpl::getUnrollingPreferences(
              continue;
        return;
      }
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Cost += getInstructionCost(&I, Operands,
+                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }
 
   // Apply subtarget-specific unrolling preferences.
-  switch (ST->getProcFamily()) {
-  case AArch64Subtarget::AppleA14:
-  case AArch64Subtarget::AppleA15:
-  case AArch64Subtarget::AppleA16:
-  case AArch64Subtarget::AppleM4:
+  if (ST->isAppleMLike())
     getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
-    break;
-  case AArch64Subtarget::Falkor:
-    if (EnableFalkorHWPFUnrollFix)
-      getFalkorUnrollingPreferences(L, SE, UP);
-    break;
-  default:
-    break;
-  }
+  else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+           EnableFalkorHWPFUnrollFix)
+    getFalkorUnrollingPreferences(L, SE, UP);
 
   // If this is a small, multi-exit loop similar to something like std::find,
   // then there is typically a performance improvement achieved by unrolling.
@@ -5232,6 +5377,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
     UP.UnrollAndJam = true;
     UP.UnrollAndJamInnerLoopThreshold = 60;
   }
+
+  // Force unrolling small loops can be very useful because of the branch
+  // taken cost of the backedge.
+  if (Cost < Aarch64ForceUnrollThreshold)
+    UP.Force = true;
 }
 
 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
@@ -5902,6 +6052,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
     SrcTy = DstTy;
   }
 
+  // Check for identity masks, which we can treat as free for both fixed and
+  // scalable vector paths.
+  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
+      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
+      all_of(enumerate(Mask), [](const auto &M) {
+        return M.value() < 0 || M.value() == (int)M.index();
+      }))
+    return 0;
+
   // Segmented shuffle matching.
   if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
@@ -5949,21 +6108,13 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
       all_of(Mask, [](int E) { return E < 8; }))
     return getPerfectShuffleCost(Mask);
 
-  // Check for identity masks, which we can treat as free.
-  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
-      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-      all_of(enumerate(Mask), [](const auto &M) {
-        return M.value() < 0 || M.value() == (int)M.index();
-      }))
-    return 0;
-
   // Check for other shuffles that are not SK_ kinds but we have native
   // instructions for, for example ZIP and UZP.
   unsigned Unused;
   if (LT.second.isFixedLengthVector() &&
       LT.second.getVectorNumElements() == Mask.size() &&
       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
+      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       isREVMask(Mask, LT.second.getScalarSizeInBits(),
                 LT.second.getVectorNumElements(), 16) ||
@@ -6129,7 +6280,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
 }
 
 static bool containsDecreasingPointers(Loop *TheLoop,
-                                       PredicatedScalarEvolution *PSE) {
+                                       PredicatedScalarEvolution *PSE,
+                                       const DominatorTree &DT) {
   const auto &Strides = DenseMap<Value *, const SCEV *>();
   for (BasicBlock *BB : TheLoop->blocks()) {
     // Scan the instructions in the block and look for addresses that are
@@ -6138,8 +6290,8 @@ static bool containsDecreasingPointers(Loop *TheLoop,
       if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
-        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
-                         /*ShouldCheckWrap=*/false)
+        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
+                         /*Assume=*/true, /*ShouldCheckWrap=*/false)
                .value_or(0) < 0)
          return true;
      }
@@ -6184,7 +6336,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
   // negative strides. This will require extra work to reverse the loop
   // predicate, which may be expensive.
   if (containsDecreasingPointers(TFI->LVL->getLoop(),
-                                 TFI->LVL->getPredicatedScalarEvolution()))
+                                 TFI->LVL->getPredicatedScalarEvolution(),
+                                 *TFI->LVL->getDominatorTree()))
     Required |= TailFoldingOpts::Reverse;
   if (Required == TailFoldingOpts::Disabled)
     Required |= TailFoldingOpts::Simple;
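
Illustrative note (not part of the commit above): the new funnel-shift handling in getIntrinsicInstrCost recognizes an fshl/fshr whose first and second operands are the same value as a rotate. A minimal LLVM IR sketch of such a pattern is shown below; the function and value names are hypothetical and only serve to show the shape of IR the new cost path applies to.

    ; Rotate-right written as a funnel shift with equal data operands. On
    ; AArch64 this lowers to a single ror, so the hook above returns cost 1;
    ; an fshl (rotate-left) with a non-constant amount costs one extra
    ; instruction for the neg of the shift amount.
    declare i64 @llvm.fshr.i64(i64, i64, i64)

    define i64 @rotr64(i64 %x, i64 %amt) {
      %r = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %amt)
      ret i64 %r
    }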
