Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  90
1 file changed, 63 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e3370d3..fede586 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode(
static bool isSMEABIRoutineCall(const CallInst &CI,
const AArch64TargetLowering &TLI) {
const auto *F = CI.getCalledFunction();
- return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
+ return F &&
+ SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
}
/// Returns true if the function has explicit operations that can only be
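As context for the name-based check in the hunk above, here is a hypothetical standalone sketch of the idea behind isSMEABIRoutineCall; the helper below is not LLVM API, and the routine names are examples of SME ABI support routines (the real lookup goes through SMEAttrs and the runtime libcalls info).

#include <cassert>
#include <set>
#include <string>

// Hypothetical stand-in for the SMEAttrs(F->getName(), ...) lookup: classify
// a callee purely by name against known SME ABI support routines.
static bool looksLikeSMEABIRoutine(const std::string &CalleeName) {
  static const std::set<std::string> Routines = {
      "__arm_sme_state", "__arm_tpidr2_save", "__arm_tpidr2_restore",
      "__arm_za_disable"};
  return Routines.count(CalleeName) != 0;
}

int main() {
  assert(looksLikeSMEABIRoutine("__arm_tpidr2_save"));
  assert(!looksLikeSMEABIRoutine("memcpy"));
  return 0;
}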
@@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
// change only once and avoid inlining of G into F.
SMEAttrs FAttrs(*F);
- SMECallAttrs CallAttrs(Call, getTLI());
+ SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
if (F == Call.getCaller()) // (1)
@@ -957,23 +958,50 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return TyL.first + ExtraCost;
}
case Intrinsic::get_active_lane_mask: {
- auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
- if (RetTy) {
- EVT RetVT = getTLI()->getValueType(DL, RetTy);
- EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
- if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
- !getTLI()->isTypeLegal(RetVT)) {
- // We don't have enough context at this point to determine if the mask
- // is going to be kept live after the block, which will force the vXi1
- // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
- // For now, we just assume the vectorizer created this intrinsic and
- // the result will be the input for a PHI. In this case the cost will
- // be extremely high for fixed-width vectors.
- // NOTE: getScalarizationOverhead returns a cost that's far too
- // pessimistic for the actual generated codegen. In reality there are
- // two instructions generated per lane.
- return RetTy->getNumElements() * 2;
+ auto RetTy = cast<VectorType>(ICA.getReturnType());
+ EVT RetVT = getTLI()->getValueType(DL, RetTy);
+ EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
+ break;
+
+ if (RetTy->isScalableTy()) {
+ if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+ TargetLowering::TypeSplitVector)
+ break;
+
+ auto LT = getTypeLegalizationCost(RetTy);
+ InstructionCost Cost = LT.first;
+ // When SVE2p1 or SME2 is available, we can halve the cost returned by
+ // getTypeLegalizationCost, as get_active_lane_mask may lower to the
+ // sve_whilelo_x2 intrinsic, e.g.
+ // nxv32i1 = get_active_lane_mask(base, idx) ->
+ // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
+ if (ST->hasSVE2p1() || ST->hasSME2()) {
+ Cost /= 2;
+ if (Cost == 1)
+ return Cost;
}
+
+ // If more than one whilelo intrinsic is required, include the extra cost
+ // of the saturating add & select needed to increment the start value
+ // after the first intrinsic call.
+ Type *OpTy = ICA.getArgTypes()[0];
+ IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
+ InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
+ Type *CondTy = OpTy->getWithNewBitWidth(1);
+ SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
+ CmpInst::ICMP_UGT, CostKind);
+ return Cost + (SplitCost * (Cost - 1));
+ } else if (!getTLI()->isTypeLegal(RetVT)) {
+ // We don't have enough context at this point to determine if the mask
+ // is going to be kept live after the block, which will force the vXi1
+ // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
+ // For now, we just assume the vectorizer created this intrinsic and
+ // the result will be the input for a PHI. In this case the cost will
+ // be extremely high for fixed-width vectors.
+ // NOTE: getScalarizationOverhead returns a cost that's far too
+ // pessimistic for the actual generated codegen. In reality there are
+ // two instructions generated per lane.
+ return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
}
break;
}
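For illustration, a minimal standalone sketch of the cost arithmetic the hunk above introduces for get_active_lane_mask; the struct and helper names are invented for this example and are not LLVM API. It models only the two return paths: Cost + SplitCost * (Cost - 1) for scalable masks that legalize by splitting, and NumElements * 2 for illegal fixed-width masks.

#include <cassert>

// Hypothetical inputs mirroring what the real code queries from TTI/TLI.
struct LaneMaskCostInputs {
  unsigned LegalParts;    // LT.first: number of legal parts after splitting
  bool HasSVE2p1orSME2;   // enables the paired whilelo (x2) form
  unsigned SplitStepCost; // cost of the uadd_sat + select between parts
  bool Scalable;          // scalable vs. fixed-width mask
  unsigned NumElements;   // only meaningful for fixed-width masks
};

// Sketch of the cost model: not LLVM code, just the arithmetic.
static unsigned getActiveLaneMaskCostSketch(const LaneMaskCostInputs &In) {
  if (In.Scalable) {
    unsigned Cost = In.LegalParts;
    // With SVE2p1/SME2 each whilelo_x2 produces two predicate parts.
    if (In.HasSVE2p1orSME2) {
      Cost /= 2;
      if (Cost == 1)
        return Cost;
    }
    // Each additional whilelo needs a saturating add + select to advance
    // the start index, hence (Cost - 1) extra split steps.
    return Cost + In.SplitStepCost * (Cost - 1);
  }
  // Fixed-width vXi1 masks that are not legal: assume ~2 instructions/lane.
  return In.NumElements * 2;
}

int main() {
  // e.g. nxv32i1 splits into two nxv16i1 parts; with SVE2p1 a single
  // whilelo_x2 covers both, so the cost collapses to 1.
  assert(getActiveLaneMaskCostSketch({2, true, 3, true, 0}) == 1);
  // Without SVE2p1/SME2: 2 whilelo calls + 1 split step of cost 3 -> 5.
  assert(getActiveLaneMaskCostSketch({2, false, 3, true, 0}) == 5);
  // Illegal fixed-width v8i1 mask -> 16.
  assert(getActiveLaneMaskCostSketch({0, false, 0, false, 8}) == 16);
  return 0;
}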
@@ -1577,18 +1605,26 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
}
static bool isAllActivePredicate(Value *Pred) {
- // Look through convert.from.svbool(convert.to.svbool(...) chain.
Value *UncastedPred;
+
+ // Look through predicate casts that only remove lanes.
if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
- m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
- m_Value(UncastedPred)))))
- // If the predicate has the same or less lanes than the uncasted
- // predicate then we know the casting has no effect.
- if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
- cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
- Pred = UncastedPred;
+ m_Value(UncastedPred)))) {
+ auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
+ Pred = UncastedPred;
+
+ if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
+ m_Value(UncastedPred))))
+ // If the predicate has the same or fewer lanes than the uncasted
+ // predicate then we know the casting has no effect.
+ if (OrigPredTy->getMinNumElements() <=
+ cast<ScalableVectorType>(UncastedPred->getType())
+ ->getMinNumElements())
+ Pred = UncastedPred;
+ }
+
auto *C = dyn_cast<Constant>(Pred);
- return (C && C->isAllOnesValue());
+ return C && C->isAllOnesValue();
}
// Simplify `V` by only considering the operations that affect active lanes.
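Finally, a tiny standalone sketch (invented helper, not LLVM API) of the lane-count rule isAllActivePredicate relies on when looking through a convert_to_svbool/convert_from_svbool pair: the cast pair is a no-op only when the outer predicate has no more lanes than the uncasted one, since widening introduces inactive lanes.

#include <cassert>

// Minimal model of the rule: OuterMinLanes is the element count of the
// convert_from_svbool result, UncastedMinLanes that of the original predicate.
static bool castPairHasNoEffect(unsigned OuterMinLanes,
                                unsigned UncastedMinLanes) {
  // Same or fewer lanes: every lane of the outer predicate comes straight
  // from the uncasted one, so an all-ones source stays all-active.
  return OuterMinLanes <= UncastedMinLanes;
}

int main() {
  assert(castPairHasNoEffect(4, 4));  // nxv4i1 -> svbool -> nxv4i1
  assert(castPairHasNoEffect(2, 4));  // narrowing only drops lanes
  assert(!castPairHasNoEffect(8, 4)); // widening adds inactive lanes
  return 0;
}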