From 37ae4ad0eef338776c7e2cffb3896153d43dcd90 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 29 Apr 2024 09:57:37 -0400 Subject: [SLP]Support minbitwidth analysis for buildvector nodes. Metric: size..text Program size..text exp ref diff test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 42906.00 42986.00 0.2% test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 42909.00 42989.00 0.2% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664581.00 664661.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664581.00 664661.00 0.0% Less is better. Replaces `buildvector

+ trunc

to

` sequences to `buildvector

of { trunc in to im }` scalars, which is free in most cases, results in better code. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/88504 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 328 +++++++++++++-------- .../gather-buildvector-with-minbitwidth-user.ll | 7 +- .../AArch64/gather-with-minbith-user.ll | 3 +- .../AArch64/user-node-not-in-bitwidths.ll | 7 +- .../SystemZ/minbitwidth-root-trunc.ll | 6 +- .../X86/minbitwidth-node-with-multi-users.ll | 3 +- 6 files changed, 217 insertions(+), 137 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2facd03..e3a1b0d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2487,12 +2487,12 @@ private: /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. template - ResTy processBuildVector(const TreeEntry *E, Args &...Params); + ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. - Value *createBuildVector(const TreeEntry *E); + Value *createBuildVector(const TreeEntry *E, Type *ScalarTy); /// Returns the instruction in the bundle, which can be used as a base point /// for scheduling. Usually it is the last instruction in the bundle, except @@ -2556,7 +2556,8 @@ private: /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 
- InstructionCost getGatherCost(ArrayRef VL, bool ForPoisonSrc) const; + InstructionCost getGatherCost(ArrayRef VL, bool ForPoisonSrc, + Type *ScalarTy) const; /// Set the Builder insert point to one after the last instruction in /// the bundle @@ -2564,7 +2565,7 @@ private: /// \returns a vector from a collection of scalars in \p VL. if \p Root is not /// specified, the starting vector value is poison. - Value *gather(ArrayRef VL, Value *Root); + Value *gather(ArrayRef VL, Value *Root, Type *ScalarTy); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -7876,6 +7877,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool IsFinalized = false; SmallVector CommonMask; SmallVector, 2> InVectors; + Type *ScalarTy = nullptr; const TargetTransformInfo &TTI; InstructionCost Cost = 0; SmallDenseSet VectorizedVals; @@ -7905,13 +7907,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost getBuildVectorCost(ArrayRef VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, IsaPred)) return TTI::TCC_Free; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL.begin(), VL.end()); // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. 
InstructionsState S = getSameOpcode(VL, *R.TLI); - const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType()); + const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); unsigned MinVF = R.getMinVF(2 * Sz); if (VL.size() > 2 && ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || @@ -7925,7 +7927,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }))) && !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && !isSplat(Gathers)) { - InstructionCost BaseCost = R.getGatherCost(Gathers, !Root); + InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); SetVector VectorizedLoads; SmallVector> VectorizedStarts; SmallVector ScatterVectorized; @@ -8053,7 +8055,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VecTy, Mask, CostKind); } } else { - GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true); + GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, + PointerOps.front()->getType()); } } if (NeedInsertSubvectorAnalysis) { @@ -8087,18 +8090,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { transform(VL, ShuffleMask.begin(), [](Value *V) { return isa(V) ? PoisonMaskElem : 0; }); - InstructionCost InsertCost = TTI.getVectorInstrCost( - Instruction::InsertElement, VecTy, CostKind, 0, - PoisonValue::get(VecTy), *It); - return InsertCost + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, - ShuffleMask, CostKind, /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/*It); + InstructionCost InsertCost = + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0, + PoisonValue::get(VecTy), *It); + return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTy, ShuffleMask, CostKind, + /*Index=*/0, /*SubTp=*/nullptr, + /*Args=*/*It); } return GatherCost + (all_of(Gathers, IsaPred) ? 
TTI::TCC_Free - : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); + : R.getGatherCost(Gathers, !Root && VL.equals(Gathers), + ScalarTy)); }; /// Compute the cost of creating a vector containing the extracted values from @@ -8118,8 +8122,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { return Sz; return std::max(Sz, VecTy->getNumElements()); }); - unsigned NumSrcRegs = TTI.getNumberOfParts( - FixedVectorType::get(VL.front()->getType(), NumElts)); + unsigned NumSrcRegs = + TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts)); if (NumSrcRegs == 0) NumSrcRegs = 1; // FIXME: this must be moved to TTI for better estimation. @@ -8165,17 +8169,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { std::optional RegShuffleKind = CheckPerRegistersShuffle(SubMask); if (!RegShuffleKind) { - Cost += ::getShuffleCost( - TTI, *ShuffleKinds[Part], - FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); + Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part], + FixedVectorType::get(ScalarTy, NumElts), + MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += ::getShuffleCost( - TTI, *RegShuffleKind, - FixedVectorType::get(VL.front()->getType(), EltsPerVector), - SubMask); + Cost += ::getShuffleCost(TTI, *RegShuffleKind, + FixedVectorType::get(ScalarTy, EltsPerVector), + SubMask); } } return Cost; @@ -8292,6 +8295,48 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallVector CommonMask(Mask.begin(), Mask.end()); Value *V1 = P1.dyn_cast(), *V2 = P2.dyn_cast(); unsigned CommonVF = Mask.size(); + InstructionCost ExtraCost = 0; + auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E, + unsigned VF) -> InstructionCost { + if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars)) + return TTI::TCC_Free; + Type *EScalarTy = E.Scalars.front()->getType(); + bool IsSigned = true; + if (auto It = 
R.MinBWs.find(&E); It != R.MinBWs.end()) { + EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first); + IsSigned = It->second.second; + } + if (EScalarTy != ScalarTy) { + unsigned CastOpcode = Instruction::Trunc; + unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); + unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); + if (DstSz > SrcSz) + CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; + return TTI.getCastInstrCost(CastOpcode, + FixedVectorType::get(ScalarTy, VF), + FixedVectorType::get(EScalarTy, VF), + TTI::CastContextHint::None, CostKind); + } + return TTI::TCC_Free; + }; + auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost { + if (isa(V)) + return TTI::TCC_Free; + auto *VecTy = cast(V->getType()); + Type *EScalarTy = VecTy->getElementType(); + if (EScalarTy != ScalarTy) { + bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL)); + unsigned CastOpcode = Instruction::Trunc; + unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); + unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); + if (DstSz > SrcSz) + CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; + return TTI.getCastInstrCost( + CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()), + VecTy, TTI::CastContextHint::None, CostKind); + } + return TTI::TCC_Free; + }; if (!V1 && !V2 && !P2.isNull()) { // Shuffle 2 entry nodes. 
const TreeEntry *E = P1.get(); @@ -8318,11 +8363,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } } CommonVF = E->Scalars.size(); + ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) + + GetNodeMinBWAffectedCost(*E2, CommonVF); + } else { + ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) + + GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor()); } - V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); - V2 = getAllOnesValue( - *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); + V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF)); } else if (!V1 && P2.isNull()) { // Shuffle single entry node. const TreeEntry *E = P1.get(); @@ -8341,8 +8389,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } CommonVF = E->Scalars.size(); } - V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); // Not identity/broadcast? Try to see if the original vector is better. if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && CommonVF == CommonMask.size() && @@ -8359,6 +8407,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } } else if (V1 && P2.isNull()) { // Shuffle single vector. 
+ ExtraCost += GetValueMinBWAffectedCost(V1); CommonVF = cast(V1->getType())->getNumElements(); assert( all_of(Mask, @@ -8385,11 +8434,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } CommonVF = VF; } - V1 = Constant::getNullValue( - FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); - V2 = getAllOnesValue( - *R.DL, - FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + ExtraCost += GetValueMinBWAffectedCost(V1); + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); + ExtraCost += GetNodeMinBWAffectedCost( + *E2, std::min(CommonVF, E2->getVectorFactor())); + V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF)); } else if (!V1 && V2) { // Shuffle vector and tree node. unsigned VF = cast(V2->getType())->getNumElements(); @@ -8413,11 +8462,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } CommonVF = VF; } - V1 = Constant::getNullValue( - FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); - V2 = getAllOnesValue( - *R.DL, - FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + ExtraCost += GetNodeMinBWAffectedCost( + *E1, std::min(CommonVF, E1->getVectorFactor())); + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); + ExtraCost += GetValueMinBWAffectedCost(V2); + V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF)); } else { assert(V1 && V2 && "Expected both vectors."); unsigned VF = cast(V1->getType())->getNumElements(); @@ -8428,30 +8477,33 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { return Idx < 2 * static_cast(CommonVF); }) && "All elements in mask must be less than 2 * CommonVF."); + ExtraCost += + GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2); if (V1->getType() != V2->getType()) { - V1 = Constant::getNullValue(FixedVectorType::get( - cast(V1->getType())->getElementType(), CommonVF)); - V2 = getAllOnesValue( - *R.DL, FixedVectorType::get( - 
cast(V1->getType())->getElementType(), - CommonVF)); + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); + V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF)); + } else { + if (cast(V1->getType())->getElementType() != ScalarTy) + V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF)); + if (cast(V2->getType())->getElementType() != ScalarTy) + V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF)); } } - InVectors.front() = Constant::getNullValue(FixedVectorType::get( - cast(V1->getType())->getElementType(), - CommonMask.size())); + InVectors.front() = Constant::getNullValue( + FixedVectorType::get(ScalarTy, CommonMask.size())); if (InVectors.size() == 2) InVectors.pop_back(); - return BaseShuffleAnalysis::createShuffle( - V1, V2, CommonMask, Builder); + return ExtraCost + BaseShuffleAnalysis::createShuffle( + V1, V2, CommonMask, Builder); } public: - ShuffleCostEstimator(TargetTransformInfo &TTI, + ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef VectorizedVals, BoUpSLP &R, SmallPtrSetImpl &CheckedExtracts) - : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), - R(R), CheckedExtracts(CheckedExtracts) {} + : ScalarTy(ScalarTy), TTI(TTI), + VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), + CheckedExtracts(CheckedExtracts) {} Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, ArrayRef> ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput) { @@ -8547,7 +8599,7 @@ public: if (NumParts != 1 && UniqueBases.size() != 1) { UseVecBaseAsInput = true; VecBase = Constant::getNullValue( - FixedVectorType::get(VL.front()->getType(), CommonMask.size())); + FixedVectorType::get(ScalarTy, CommonMask.size())); } return VecBase; } @@ -8575,8 +8627,7 @@ public: return; } assert(!CommonMask.empty() && "Expected non-empty common mask."); - auto *MaskVecTy = - FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + auto 
*MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size()); unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; @@ -8593,8 +8644,7 @@ public: return; } assert(!CommonMask.empty() && "Expected non-empty common mask."); - auto *MaskVecTy = - FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size()); unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; @@ -8694,7 +8744,7 @@ public: return ConstantVector::getSplat( ElementCount::getFixed( cast(Root->getType())->getNumElements()), - getAllOnesValue(*R.DL, VL.front()->getType())); + getAllOnesValue(*R.DL, ScalarTy)); } InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. @@ -8840,7 +8890,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (isa(VL[0])) return InstructionCost::getInvalid(); return processBuildVector( - E, *TTI, VectorizedVals, *this, CheckedExtracts); + E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -10880,12 +10930,8 @@ BoUpSLP::isGatherShuffledEntry( return Res; } -InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, - bool ForPoisonSrc) const { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); +InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, + Type *ScalarTy) const { auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); bool DuplicateNonConst = false; // Find the cost of inserting/extracting values from the vector. 
@@ -10896,6 +10942,11 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost; auto EstimateInsertCost = [&](unsigned I, Value *V) { + if (V->getType() != ScalarTy) { + Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(), + TTI::CastContextHint::None, CostKind); + V = nullptr; + } if (!ForPoisonSrc) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, @@ -11123,7 +11174,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::gather(ArrayRef VL, Value *Root) { +Value *BoUpSLP::gather(ArrayRef VL, Value *Root, Type *ScalarTy) { // List of instructions/lanes from current block and/or the blocks which are // part of the current loop. These instructions will be inserted at the end to // make it possible to optimize loops and hoist invariant instructions out of @@ -11149,14 +11200,11 @@ Value *BoUpSLP::gather(ArrayRef VL, Value *Root) { auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos, Type *Ty) { Value *Scalar = V; - if (cast(Vec->getType())->getElementType() != Ty) { - assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() && + if (Scalar->getType() != Ty) { + assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() && "Expected integer types only."); - Vec = Builder.CreateIntCast( - Vec, - VectorType::get(Ty, - cast(Vec->getType())->getElementCount()), - !isKnownNonNegative(Vec, SimplifyQuery(*DL))); + Scalar = Builder.CreateIntCast( + Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); } Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); @@ -11184,10 +11232,7 @@ Value *BoUpSLP::gather(ArrayRef VL, Value *Root) { } return Vec; }; - Value *Val0 = - isa(VL[0]) ? 
cast(VL[0])->getValueOperand() : VL[0]; - Type *ScalarTy = Val0->getType(); - FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); Value *Vec = Root ? Root : PoisonValue::get(VecTy); SmallVector NonConsts; // Insert constant values at first. @@ -11266,6 +11311,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// resulting shuffle and the second operand sets to be the newly added /// operand. The \p CommonMask is transformed in the proper way after that. SmallVector InVectors; + Type *ScalarTy = nullptr; IRBuilderBase &Builder; BoUpSLP &R; @@ -11376,9 +11422,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { CommonMask[Idx] = Idx; } + /// Cast value \p V to the vector type with the same number of elements, but + /// the base type \p ScalarTy. + Value *castToScalarTyElem(Value *V) { + auto *VecTy = cast(V->getType()); + if (VecTy->getElementType() == ScalarTy) + return V; + return Builder.CreateIntCast( + V, VectorType::get(ScalarTy, VecTy->getElementCount()), + !isKnownNonNegative(V, SimplifyQuery(*R.DL))); + } + public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R) - : Builder(Builder), R(R) {} + ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) + : ScalarTy(ScalarTy), Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. 
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, @@ -11417,8 +11474,10 @@ public: continue; R.eraseInstruction(EI); } - if (NumParts == 1 || UniqueBases.size() == 1) + if (NumParts == 1 || UniqueBases.size() == 1) { + VecBase = castToScalarTyElem(VecBase); return VecBase; + } UseVecBaseAsInput = true; auto TransformToIdentity = [](MutableArrayRef Mask) { for (auto [I, Idx] : enumerate(Mask)) @@ -11455,6 +11514,7 @@ public: "Expected vectors of the same size."); PrevSize = Size; #endif // NDEBUG + VecOp = castToScalarTyElem(VecOp); Bases[SubMask[I] < Size ? 0 : 1] = VecOp; } if (!Bases.front()) @@ -11510,10 +11570,10 @@ public: return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), - E->getVectorFactor()); + auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor()); return Builder.CreateAlignedLoad( - VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())), + ResVecTy, + PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())), MaybeAlign()); } /// Adds 2 input vectors (in form of tree entries) and the mask for their @@ -11529,6 +11589,8 @@ public: /// Adds 2 input vectors and the mask for their shuffling. void add(Value *V1, Value *V2, ArrayRef Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); + V1 = castToScalarTyElem(V1); + V2 = castToScalarTyElem(V2); if (InVectors.empty()) { InVectors.push_back(V1); InVectors.push_back(V2); @@ -11556,6 +11618,7 @@ public: } /// Adds another one input vector and the mask for the shuffling. 
void add(Value *V1, ArrayRef Mask, bool = false) { + V1 = castToScalarTyElem(V1); if (InVectors.empty()) { if (!isa(V1->getType())) { V1 = createShuffle(V1, nullptr, CommonMask); @@ -11619,7 +11682,7 @@ public: } Value *gather(ArrayRef VL, unsigned MaskVF = 0, Value *Root = nullptr) { - return R.gather(VL, Root); + return R.gather(VL, Root, ScalarTy); } Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } /// Finalize emission of the shuffles. @@ -11719,7 +11782,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, } if (IsSameVE) { auto FinalShuffle = [&](Value *V, ArrayRef Mask) { - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleInstructionBuilder ShuffleBuilder( + cast(V->getType())->getElementType(), Builder, *this); ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; @@ -11794,7 +11858,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, } template -ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { +ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, + Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); @@ -11842,7 +11907,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } return true; }; - BVTy ShuffleBuilder(Params...); + BVTy ShuffleBuilder(ScalarTy, Params...); ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); @@ -11851,7 +11916,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { bool UseVecBaseAsInput = false; SmallVector> GatherShuffles; SmallVector> Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); + Type *OrigScalarTy = GatheredScalars.front()->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); unsigned NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) 
@@ -11886,7 +11951,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); + PoisonValue::get(OrigScalarTy)); } } } @@ -11946,12 +12011,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { }); })) GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); + PoisonValue::get(OrigScalarTy)); } // Remove shuffled elements from list of gathers. for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { if (Mask[I] != PoisonMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); + GatheredScalars[I] = PoisonValue::get(OrigScalarTy); } } } @@ -11962,7 +12027,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // such sequences. bool IsSplat = IsRootPoison && isSplat(Scalars) && (Scalars.size() > 2 || Scalars.front() == Scalars.back()); - Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy)); + Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy)); SmallVector UndefPos; DenseMap UniquePositions; // Gather unique non-const values and all constant values. 
@@ -11984,7 +12049,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { ++NumNonConsts; SinglePos = I; Value *OrigV = V; - Scalars[I] = PoisonValue::get(ScalarTy); + Scalars[I] = PoisonValue::get(OrigScalarTy); if (IsSplat) { Scalars.front() = OrigV; ReuseMask[I] = 0; @@ -12000,7 +12065,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { ReuseMask.assign(VF, PoisonMaskElem); std::swap(Scalars.front(), Scalars[SinglePos]); if (!UndefPos.empty() && UndefPos.front() == 0) - Scalars.front() = UndefValue::get(ScalarTy); + Scalars.front() = UndefValue::get(OrigScalarTy); } ReuseMask[SinglePos] = SinglePos; } else if (!UndefPos.empty() && IsSplat) { @@ -12030,7 +12095,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // Replace the undef by the poison, in the mask it is replaced by // non-poisoned scalar already. if (I != Pos) - Scalars[I] = PoisonValue::get(ScalarTy); + Scalars[I] = PoisonValue::get(OrigScalarTy); } } else { // Replace undefs by the poisons, emit broadcast and then emit @@ -12038,7 +12103,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { for (int I : UndefPos) { ReuseMask[I] = PoisonMaskElem; if (isa(Scalars[I])) - Scalars[I] = PoisonValue::get(ScalarTy); + Scalars[I] = PoisonValue::get(OrigScalarTy); } NeedFreeze = true; } @@ -12093,9 +12158,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { IsUsedInExpr = false; - ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( - ScalarTy, GatheredScalars.size())), - ExtractMask, /*ForExtracts=*/true); + ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask, + /*ForExtracts=*/true); } } if (!GatherShuffles.empty()) { @@ -12176,9 +12240,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // contains only constant to build final vector and then shuffle. 
for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { if (EnoughConstsForShuffle && isa(GatheredScalars[I])) - NonConstants[I] = PoisonValue::get(ScalarTy); + NonConstants[I] = PoisonValue::get(OrigScalarTy); else - GatheredScalars[I] = PoisonValue::get(ScalarTy); + GatheredScalars[I] = PoisonValue::get(OrigScalarTy); } // Generate constants for final shuffle and build a mask for them. if (!all_of(GatheredScalars, IsaPred)) { @@ -12224,9 +12288,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { return Res; } -Value *BoUpSLP::createBuildVector(const TreeEntry *E) { - return processBuildVector(E, Builder, - *this); +Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { + return processBuildVector(E, ScalarTy, + Builder, *this); } Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { @@ -12239,18 +12303,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return E->VectorizedValue; } + Value *V = E->Scalars.front(); + Type *ScalarTy = V->getType(); + if (auto *Store = dyn_cast(V)) + ScalarTy = Store->getValueOperand()->getType(); + else if (auto *IE = dyn_cast(V)) + ScalarTy = IE->getOperand(1)->getType(); + auto It = MinBWs.find(E); + if (It != MinBWs.end()) + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); if (E->State == TreeEntry::NeedToGather) { // Set insert point for non-reduction initial nodes. 
if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); - Value *Vec = createBuildVector(E); + Value *Vec = createBuildVector(E, ScalarTy); E->VectorizedValue = Vec; return Vec; } bool IsReverseOrder = isReverseOrder(E->ReorderIndices); auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) { - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store) { ArrayRef Mask = ArrayRef(reinterpret_cast(E->ReorderIndices.begin()), @@ -12271,14 +12345,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (auto *Store = dyn_cast(VL0)) - ScalarTy = Store->getValueOperand()->getType(); - else if (auto *IE = dyn_cast(VL0)) - ScalarTy = IE->getOperand(1)->getType(); - auto It = MinBWs.find(E); - if (It != MinBWs.end()) - ScalarTy = IntegerType::get(F->getContext(), It->second.first); auto GetOperandSignedness = [&](unsigned Idx) { const TreeEntry *OpE = getOperandEntry(E, Idx); bool IsSigned = false; @@ -12291,7 +12357,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { }); return IsSigned; }; - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() || @@ -13545,7 +13610,8 @@ Value *BoUpSLP::vectorizeTree( else CombinedMask2[I] = Mask[I] - VF; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleInstructionBuilder ShuffleBuilder( + cast(V1->getType())->getElementType(), Builder, *this); ShuffleBuilder.add(V1, CombinedMask1); if (V2) ShuffleBuilder.add(V2, CombinedMask2); @@ -14580,13 +14646,27 @@ bool BoUpSLP::collectValuesToDemote( return false; bool Res = all_of( E.Scalars, 
std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth))); - // Gather demoted constant operands. - if (Res && E.State == TreeEntry::NeedToGather && - all_of(E.Scalars, IsaPred)) - ToDemote.push_back(E.Idx); + // Demote gathers. + if (Res && E.State == TreeEntry::NeedToGather) { + // Check possible extractelement instructions bases and final vector + // length. + SmallPtrSet UniqueBases; + for (Value *V : E.Scalars) { + auto *EE = dyn_cast(V); + if (!EE) + continue; + UniqueBases.insert(EE->getVectorOperand()); + } + const unsigned VF = E.Scalars.size(); + Type *OrigScalarTy = E.Scalars.front()->getType(); + if (UniqueBases.size() <= 2 || + TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) == + TTI->getNumberOfParts(FixedVectorType::get( + IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF))) + ToDemote.push_back(E.Idx); + } return Res; }; - // TODO: improve handling of gathered values and others. if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { return all_of(V->users(), [&](User *U) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll index 6907724..3771ec4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll @@ -5,12 +5,7 @@ define void @h() { ; CHECK-LABEL: define void @h() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16> -; CHECK-NEXT: 
store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll index d51ef0b..76bb882 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll @@ -5,7 +5,8 @@ define void @h() { ; CHECK-LABEL: define void @h() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 0 to i1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> , i1 [[TMP6]], i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll index 6404cf4..2ab6e91 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll @@ -5,7 +5,12 @@ define void @h() { ; CHECK-LABEL: define void @h() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> , i1 [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16> +; 
CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll index 7b4e2b0..1bb87bf 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll @@ -7,9 +7,9 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) { ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i128 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i128> poison, i128 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP9]] to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll index 668d3c3..0ab5627 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll @@ -16,8 +16,7 @@ define void @test() { ; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]] 
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]]) -- cgit v1.1