-rw-r--r-- | llvm/include/llvm/Analysis/TargetTransformInfo.h | 1
-rw-r--r-- | llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h | 18
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 35
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 130
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 8
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 83
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 21
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 84
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 12
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h | 5
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/compress-idioms.ll | 260
-rw-r--r-- | llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 8
12 files changed, 591 insertions, 74 deletions
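For context, the loops exercised by the new compress-idioms.ll test correspond roughly to the C++ sketch below; the function names mirror the test functions, while the signatures are paraphrased from the test IR rather than copied from any existing source. With this patch, the conditionally advanced destination (pointer or index) is recognized as a monotonic phi, the guarded store becomes an llvm.masked.compressstore (and the symmetric load form an llvm.masked.expandload), and the phi is advanced once per vector iteration by the number of active mask lanes.

  // Store form whose destination pointer advances only when the predicate holds.
  void test_store_with_pointer(int *dst, const int *src, int c, int n) {
    for (int i = 0; i < n; ++i)
      if (src[i] < c)
        *dst++ = src[i]; // dst is the monotonic phi; step == sizeof(int)
  }

  // Store form whose integer index advances only when the predicate holds.
  void test_store_with_index(int *dst, const int *src, int c, int n) {
    int idx = 0;
    for (int i = 0; i < n; ++i)
      if (src[i] < c)
        dst[idx++] = src[i]; // idx is the monotonic phi; step == 1
  }
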
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 1aed98e..bd30825 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1403,6 +1403,7 @@ public: Normal, ///< The cast is used with a normal load/store. Masked, ///< The cast is used with a masked load/store. GatherScatter, ///< The cast is used with a gather/scatter. + Compressed, ///< The cast is used with an expand load/compress store. Interleave, ///< The cast is used with an interleaved load/store. Reversed, ///< The cast is used with a reversed load/store. }; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index d654ac3..757bff2 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -269,6 +269,10 @@ public: /// induction descriptor. using InductionList = MapVector<PHINode *, InductionDescriptor>; + /// MonotonicPHIList saves monotonic phi variables and maps them to the + /// monotonic phi descriptor. + using MonotonicPHIList = MapVector<PHINode *, MonotonicDescriptor>; + /// RecurrenceSet contains the phi nodes that are recurrences other than /// inductions and reductions. using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; @@ -304,6 +308,11 @@ public: /// Returns the induction variables found in the loop. const InductionList &getInductionVars() const { return Inductions; } + /// Returns the monotonic phi variables found in the loop. + const MonotonicPHIList &getMonotonicPHIs() const { return MonotonicPHIs; } + + bool hasMonotonicPHIs() const { return !MonotonicPHIs.empty(); } + /// Return the fixed-order recurrences found in the loop. RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } @@ -361,6 +370,12 @@ public: /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). int isConsecutivePtr(Type *AccessTy, Value *Ptr) const; + /// Returns true if Phi is monotonic variable. + bool isMonotonicPHI(PHINode *Phi) const; + + /// Check if memory access is compressed when vectorizing. + bool isCompressedPtr(Type *AccessTy, Value *Ptr, BasicBlock *BB) const; + /// Returns true if \p V is invariant across all loop iterations according to /// SCEV. bool isInvariant(Value *V) const; @@ -597,6 +612,9 @@ private: /// variables can be pointers. InductionList Inductions; + /// Holds all of the monotonic phi variables that we found in the loop. + MonotonicPHIList MonotonicPHIs; + /// Holds all the casts that participate in the update chain of the induction /// variables, and that have been proven to be redundant (possibly under a /// runtime guard). 
These casts can be ignored when creating the vectorized diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 8e09e6f..cdfd556 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables.")); +static cl::opt<bool> EnableMonotonicPatterns( + "lv-monotonic-patterns", cl::init(true), cl::Hidden, + cl::desc("Enable recognition of monotonic patterns.")); + static cl::opt<bool> HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, cl::desc("Allow enabling loop hints to reorder " @@ -468,6 +472,30 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, return 0; } +bool LoopVectorizationLegality::isMonotonicPHI(PHINode *Phi) const { + return MonotonicPHIs.count(Phi); +} + +bool LoopVectorizationLegality::isCompressedPtr(Type *AccessTy, Value *Ptr, + BasicBlock *BB) const { + MonotonicDescriptor Desc; + if (!MonotonicDescriptor::isMonotonicVal(Ptr, TheLoop, Desc, *PSE.getSE())) + return false; + + // Check if memory operation will use the same mask as monotonic phi. + // TODO: relax restrictions of current implementation. + if (Desc.getPredicateEdge() != + MonotonicDescriptor::Edge(BB, BB->getUniqueSuccessor())) + return false; + + // Check if pointer step equals access size. + auto *Step = + dyn_cast<SCEVConstant>(Desc.getExpr()->getStepRecurrence(*PSE.getSE())); + if (!Step) + return false; + return Step->getAPInt() == BB->getDataLayout().getTypeAllocSize(AccessTy); +} + bool LoopVectorizationLegality::isInvariant(Value *V) const { return LAI->isInvariant(V); } @@ -874,6 +902,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } + MonotonicDescriptor MD; + if (EnableMonotonicPatterns && MonotonicDescriptor::isMonotonicPHI( + Phi, TheLoop, MD, *PSE.getSE())) { + MonotonicPHIs[Phi] = MD; + continue; + } + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { AllowedExit.insert(Phi); FixedOrderRecurrences.insert(Phi); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 490d0af..32f0d8b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1095,6 +1095,7 @@ public: CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, + CM_Compressed, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -1308,9 +1309,9 @@ public: getDivRemSpeculationCost(Instruction *I, ElementCount VF) const; - /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a - /// memory instruction with consecutive access that can be widened, or - /// CM_Unknown otherwise. + /// Returns widening decision (CM_Widen, CM_Widen_Reverse or CM_Compressed) if + /// \p I is a memory instruction with consecutive access that can be widened, + /// or CM_Unknown otherwise. 
InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group @@ -3263,6 +3264,9 @@ LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, auto *Ptr = getLoadStorePointerOperand(I); auto *ScalarTy = getLoadStoreType(I); + if (Legal->isCompressedPtr(ScalarTy, Ptr, I->getParent())) + return CM_Compressed; + // In order to be widened, the pointer should be consecutive, first of all. auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr); if (!Stride) @@ -3372,9 +3376,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return (WideningDecision == CM_Widen || - WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Interleave); + return ( + WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Interleave || WideningDecision == CM_Compressed); }; // Returns true if Ptr is the pointer operand of a memory access instruction @@ -3514,6 +3518,39 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { AddToWorklistIfAllowed(IndUpdate); } + // Handle monotonic phis (similarly to induction vars). + for (const auto &MonotonicPHI : Legal->getMonotonicPHIs()) { + auto *Phi = MonotonicPHI.first; + auto *PhiUpdate = cast<Instruction>(Phi->getIncomingValueForBlock(Latch)); + const auto &Desc = MonotonicPHI.second; + + auto UniformPhi = llvm::all_of(Phi->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + if (I == Desc.getStepInst()) + return true; + if (auto *PN = dyn_cast<PHINode>(I); PN && Desc.getChain().contains(PN)) + return true; + return !TheLoop->contains(I) || Worklist.count(I) || + IsVectorizedMemAccessUse(I, Phi); + }); + if (!UniformPhi) + continue; + + auto UniformPhiUpdate = + llvm::all_of(PhiUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + if (I == Phi) + return true; + return !TheLoop->contains(I) || Worklist.count(I) || + IsVectorizedMemAccessUse(I, Phi); + }); + if (!UniformPhiUpdate) + continue; + + AddToWorklistIfAllowed(Phi); + AddToWorklistIfAllowed(PhiUpdate); + } + Uniforms[VF].insert_range(Worklist); } @@ -4272,6 +4309,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPEVLBasedIVPHISC: case VPDef::VPPredInstPHISC: case VPDef::VPBranchOnMaskSC: + case VPDef::VPMonotonicPHISC: continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: @@ -4992,6 +5030,10 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, if (Legal->hasUncountableEarlyExit()) return 1; + // Monotonic vars don't support interleaving. 
+ if (Legal->hasMonotonicPHIs()) + return 1; + const bool HasReductions = !Legal->getReductionVars().empty(); // If we did not calculate the cost for VF (because the user selected the VF) @@ -5577,12 +5619,17 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost( Instruction *I, ElementCount VF, InstWidening Decision) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (Decision == CM_Compressed) + return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy, + /*VariableMask*/ true, Alignment, + CostKind, I); + assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) && "Expected widen decision."); - const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, @@ -6292,6 +6339,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, // the scalar version. if (isUniformAfterVectorization(I, VF)) VF = ElementCount::getFixed(1); + else if (auto *Phi = dyn_cast<PHINode>(I)) { + // Prohibit scalarization of monotonic phis. + if (Legal->isMonotonicPHI(Phi)) + return InstructionCost::getInvalid(); + } if (VF.isVector() && isProfitableToScalarize(I, VF)) return InstsToScalarize[VF][I]; @@ -6647,6 +6699,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, switch (getWideningDecision(I, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; + case LoopVectorizationCostModel::CM_Compressed: + return TTI::CastContextHint::Compressed; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: @@ -7238,6 +7292,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, } } + for (const auto &[MonotonicPhi, MonotonicDesc] : Legal->getMonotonicPHIs()) { + // TODO: currently, we restrict vectorization of non-uniform monotonic phis + // by reporting Invalid cost for it. This can be relaxed in future. + if (VF.isVector() && !CM.isUniformAfterVectorization(MonotonicPhi, VF)) + Cost = InstructionCost::getInvalid(); + else + Cost += TTI.getCFInstrCost(Instruction::PHI, CostCtx.CostKind); + CostCtx.SkipCostComputation.insert(MonotonicPhi); + } + // Pre-compute the costs for branches except for the backedge, as the number // of replicate regions in a VPlan may not directly match the number of // branches, which would lead to different decisions. @@ -8229,8 +8293,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; + bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed; bool Consecutive = - Reverse || Decision == LoopVectorizationCostModel::CM_Widen; + Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen; VPValue *Ptr = isa<LoadInst>(I) ? 
Operands[0] : Operands[1]; if (Consecutive) { @@ -8258,11 +8323,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, } if (LoadInst *Load = dyn_cast<LoadInst>(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - VPIRMetadata(*Load, LVer), I->getDebugLoc()); + Compressed, VPIRMetadata(*Load, LVer), + I->getDebugLoc()); StoreInst *Store = cast<StoreInst>(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, VPIRMetadata(*Store, LVer), + Reverse, Compressed, VPIRMetadata(*Store, LVer), I->getDebugLoc()); } @@ -8771,11 +8837,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; - assert((Legal->isReductionVariable(Phi) || + assert((Legal->isMonotonicPHI(Phi) || Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && - "can only widen reductions and fixed-order recurrences here"); + "can only widen monotonic phis, reductions and fixed-order " + "recurrences here"); VPValue *StartV = Operands[0]; - if (Legal->isReductionVariable(Phi)) { + Value *IncomingVal = + Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()); + if (Legal->isMonotonicPHI(Phi)) { + const MonotonicDescriptor &Desc = + Legal->getMonotonicPHIs().find(Phi)->second; + assert(Desc.getExpr()->getStart() == PSE.getSCEV(IncomingVal)); + PhiRecipe = new VPMonotonicPHIRecipe(Phi, Desc, StartV); + } else if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == @@ -9397,6 +9471,27 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // bring the VPlan to its final state. // --------------------------------------------------------------------------- + // Adjust the recipes for any monotonic phis. + for (VPRecipeBase &R : HeaderVPBB->phis()) { + auto *MonotonicPhi = dyn_cast<VPMonotonicPHIRecipe>(&R); + if (!MonotonicPhi) + continue; + + auto &Desc = MonotonicPhi->getDescriptor(); + auto [EdgeSrc, EdgeDst] = Desc.getPredicateEdge(); + auto &SE = *PSE.getSE(); + auto *Step = vputils::getOrCreateVPValueForSCEVExpr( + *Plan, Desc.getExpr()->getStepRecurrence(SE), SE); + + auto *MonotonicI = new VPInstruction( + VPInstruction::ComputeMonotonicResult, + {MonotonicPhi, RecipeBuilder.getEdgeMask(EdgeSrc, EdgeDst), Step}, + *Desc.getStepInst()); + auto *InsertBlock = MonotonicPhi->getBackedgeRecipe().getParent(); + InsertBlock->insert(MonotonicI, InsertBlock->getFirstNonPhi()); + MonotonicPhi->getBackedgeValue()->replaceAllUsesWith(MonotonicI); + } + // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); @@ -10587,6 +10682,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); + + if (LVL.hasMonotonicPHIs() && SelectedIC > 1) { + reportVectorizationFailure( + "Interleaving of loop with monotonic vars", + "Interleaving of loops with monotonic vars is not supported", + "CantInterleaveWithMonotonicVars", ORE, L); + return false; + } + // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. 
if (VF.Width.isVector() || SelectedIC > 1) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 06b738a..f5b2667 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -308,10 +308,11 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1); // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes, + // VPMonotonicPHIRecipe and VPExpandSCEVRecipes can also be a single scalar. assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && + VPMonotonicPHIRecipe, VPExpandSCEVRecipe>( + Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsSingleScalar = true; LastLane = 0; @@ -1005,6 +1006,7 @@ void VPlan::execute(VPTransformState *State) { auto *PhiR = cast<VPSingleDefRecipe>(&R); // VPInstructions currently model scalar Phis only. bool NeedsScalar = isa<VPInstruction>(PhiR) || + isa<VPMonotonicPHIRecipe>(PhiR) || (isa<VPReductionPHIRecipe>(PhiR) && cast<VPReductionPHIRecipe>(PhiR)->isInLoop()); Value *Phi = State->get(PhiR, NeedsScalar); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e634de1..9ce743d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -539,6 +539,7 @@ public: case VPRecipeBase::VPWidenIntOrFpInductionSC: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: + case VPRecipeBase::VPMonotonicPHISC: case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: @@ -900,6 +901,7 @@ public: Broadcast, ComputeFindLastIVResult, ComputeReductionResult, + ComputeMonotonicResult, // Extracts the last lane from its operand if it is a vector, or the last // part if scalar. In the latter case, the recipe will be removed during // unrolling. @@ -965,6 +967,11 @@ private: #endif public: + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction &I, + const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, I), + Opcode(Opcode), Name(Name.str()) {} + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), @@ -2249,6 +2256,50 @@ public: } }; +/// A recipe for handling monotonic phis. The start value is the first operand +/// of the recipe and the incoming value from the backedge is the second +/// operand. 
+class VPMonotonicPHIRecipe : public VPHeaderPHIRecipe { + MonotonicDescriptor Desc; + +public: + VPMonotonicPHIRecipe(PHINode *Phi, const MonotonicDescriptor &Desc, + VPValue *Start) + : VPHeaderPHIRecipe(VPDef::VPMonotonicPHISC, Phi, Start), Desc(Desc) {} + + ~VPMonotonicPHIRecipe() override = default; + + VPMonotonicPHIRecipe *clone() override { + auto *R = new VPMonotonicPHIRecipe(cast<PHINode>(getUnderlyingInstr()), + Desc, getStartValue()); + R->addOperand(getBackedgeValue()); + return R; + } + + VP_CLASSOF_IMPL(VPDef::VPMonotonicPHISC) + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPDef::VPMonotonicPHISC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + const MonotonicDescriptor &getDescriptor() const { return Desc; } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2974,6 +3025,9 @@ protected: /// Whether the consecutive accessed addresses are in reverse order. bool Reverse; + /// Whether the consecutive accessed addresses are compressed with mask value. + bool Compressed; + /// Whether the memory access is masked. bool IsMasked = false; @@ -2987,11 +3041,12 @@ protected: VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, std::initializer_list<VPValue *> Operands, - bool Consecutive, bool Reverse, + bool Consecutive, bool Reverse, bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), - Consecutive(Consecutive), Reverse(Reverse) { + Consecutive(Consecutive), Reverse(Reverse), Compressed(Compressed) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + assert((Consecutive || !Compressed) && "Compressed implies consecutive"); } public: @@ -3018,6 +3073,9 @@ public: /// order. bool isReverse() const { return Reverse; } + /// Return whether the consecutive loaded/stored addresses are compressed. + bool isCompressed() const { return Compressed; } + /// Return the address accessed by this recipe. VPValue *getAddr() const { return getOperand(0); } @@ -3047,18 +3105,18 @@ public: /// optional mask. 
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, + bool Consecutive, bool Reverse, bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, - Reverse, Metadata, DL), + Reverse, Compressed, Metadata, DL), VPValue(this, &Load) { setMask(Mask); } VPWidenLoadRecipe *clone() override { return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(), - getMask(), Consecutive, Reverse, *this, - getDebugLoc()); + getMask(), Consecutive, Reverse, Compressed, + *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); @@ -3089,7 +3147,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), {L.getAddr(), &EVL}, L.isConsecutive(), - L.isReverse(), L, L.getDebugLoc()), + L.isReverse(), L.isCompressed(), L, + L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3127,16 +3186,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, VPValue *Mask, bool Consecutive, bool Reverse, - const VPIRMetadata &Metadata, DebugLoc DL) + bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, - Consecutive, Reverse, Metadata, DL) { + Consecutive, Reverse, Compressed, Metadata, DL) { setMask(Mask); } VPWidenStoreRecipe *clone() override { return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(), getStoredValue(), getMask(), Consecutive, - Reverse, *this, getDebugLoc()); + Reverse, Compressed, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); @@ -3170,8 +3229,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {S.getAddr(), S.getStoredValue(), &EVL}, - S.isConsecutive(), S.isReverse(), S, - S.getDebugLoc()) { + S.isConsecutive(), S.isReverse(), S.isCompressed(), + S, S.getDebugLoc()) { setMask(Mask); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ac0f30c..a562214 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -93,6 +93,11 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); return OrigPhi->getType(); } + case VPInstruction::ComputeMonotonicResult: { + auto *PhiR = cast<VPMonotonicPHIRecipe>(R->getOperand(0)); + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + return OrigPhi->getType(); + } case VPInstruction::ExplicitVectorLength: return Type::getIntNTy(Ctx, 32); case Instruction::PHI: @@ -266,14 +271,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe()) .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe, - VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>( - [this](const auto *R) { - // Handle header phi recipes, except 
VPWidenIntOrFpInduction - // which needs special handling due it being possibly truncated. - // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPMonotonicPHIRecipe, VPWidenPointerInductionRecipe, + VPEVLBasedIVPHIRecipe>([this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. + return inferScalarType(R->getStartValue()); + }) .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>( [](const auto *R) { return R->getScalarType(); }) .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 14ed40f..cc6c839 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -722,6 +722,34 @@ Value *VPInstruction::generate(VPTransformState &State) { return ReducedPartRdx; } + case VPInstruction::ComputeMonotonicResult: { + assert(getParent()->getPlan()->getUF() == 1 && + "Expected unroll factor of 1."); + + auto *Phi = State.get(getOperand(0), /*IsScalar*/ true); + auto *PhiTy = Phi->getType(); + Value *Mask = State.get(getOperand(1), 0); + auto *MaskTy = Mask->getType(); + assert(isa<VectorType>(MaskTy) && + cast<VectorType>(MaskTy)->getElementType()->isIntegerTy(1) && + "Mask type should be <N x i1>"); + + const auto &DL = State.CFG.PrevBB->getDataLayout(); + auto *IntTy = PhiTy->isIntegerTy() ? PhiTy : DL.getIndexType(PhiTy); + + auto *Step = State.get(getOperand(2), /*IsScalar*/ true); + + auto &Builder = State.Builder; + auto *NumElems = Builder.CreateAddReduce( + Builder.CreateZExt(Mask, MaskTy->getWithNewType(IntTy))); + auto *Offset = Builder.CreateMul(NumElems, Step); + + return PhiTy->isPointerTy() + ? Builder.CreatePtrAdd(Phi, Offset, "monotonic.add", + getGEPNoWrapFlags()) + : Builder.CreateAdd(Phi, Offset, "monotonic.add", + hasNoUnsignedWrap(), hasNoSignedWrap()); + } case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: { unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; @@ -840,6 +868,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, I32Ty, {Arg0Ty, I32Ty, I1Ty}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ComputeMonotonicResult: { + Type *ElementTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); + return Ctx.TTI.getArithmeticReductionCost(Instruction::Add, VectorTy, + std::nullopt, Ctx.CostKind); + } default: // TODO: Compute cost other VPInstructions once the legacy cost model has // been retired. 
@@ -856,6 +890,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeFindLastIVResult || getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::ComputeMonotonicResult || getOpcode() == VPInstruction::AnyOf; } @@ -1053,6 +1088,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ComputeReductionResult: O << "compute-reduction-result"; break; + case VPInstruction::ComputeMonotonicResult: + O << "compute-monotonic-result"; + break; case VPInstruction::LogicalAnd: O << "logical-and"; break; @@ -2933,8 +2971,12 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { - Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Cost += Compressed + ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty, + /*VariableMask*/ true, + Alignment, Ctx.CostKind) + : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, + Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0) @@ -2972,9 +3014,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, "wide.masked.gather"); } else if (Mask) { - NewLI = - Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, - PoisonValue::get(DataTy), "wide.masked.load"); + NewLI = Compressed + ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.expand.load") + : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.load"); } else { NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); } @@ -3107,7 +3153,10 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { if (CreateScatter) NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); else if (Mask) - NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); + NewSI = Compressed + ? 
Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment, + Mask) + : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); else NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); applyMetadata(*NewSI); @@ -3907,6 +3956,29 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPMonotonicPHIRecipe::execute(VPTransformState &State) { + assert(getParent()->getPlan()->getUF() == 1 && "Expected unroll factor 1."); + Value *Start = getStartValue()->getLiveInIRValue(); + BasicBlock *VectorPH = + State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); + PHINode *MonotonicPHI = + State.Builder.CreatePHI(Start->getType(), 2, "monotonic.iv"); + MonotonicPHI->addIncoming(Start, VectorPH); + MonotonicPHI->setDebugLoc(getDebugLoc()); + State.set(this, MonotonicPHI, /*IsScalar=*/true); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPMonotonicPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "MONOTONIC-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8c8297b..d2a3eef 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -80,13 +80,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), - Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/, + VPIRMetadata(*Load), Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - VPIRMetadata(*Store), Ingredient.getDebugLoc()); + false /*Compressed*/, VPIRMetadata(*Store), + Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -3063,7 +3064,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, auto *L = new VPWidenLoadRecipe( *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()), LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + /*Reverse=*/false, /*Compressed=*/false, {}, + LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); return L; } @@ -3095,7 +3097,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, auto *S = new VPWidenStoreRecipe( *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()), StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, /*Compressed*/ false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 64065ed..29f2864 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -371,12 +371,13 @@ public: VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, VPReductionPHISC, + VPMonotonicPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes VPFirstPHISC = VPWidenPHISC, VPFirstHeaderPHISC = VPCanonicalIVPHISC, - VPLastHeaderPHISC = VPReductionPHISC, - VPLastPHISC = VPReductionPHISC, + VPLastHeaderPHISC = VPMonotonicPHISC, + VPLastPHISC = VPMonotonicPHISC, }; VPDef(const unsigned char SC) : SubclassID(SC) {} diff --git a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll index 1390092..3d2dd45 100644 --- a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll +++ b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll @@ -5,18 +5,54 @@ define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, i32 ; CHECK-LABEL: define void @test_store_with_pointer( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP12]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi ptr [ [[DST]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[MONOTONIC_IV]], i32 0 +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP4]], <4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[TMP9]] = getelementptr inbounds i8, ptr [[MONOTONIC_IV]], i64 [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 
[[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -29,7 +65,7 @@ define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, i32 ; CHECK-NEXT: [[DST_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[DST_ADDR_09]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %cmp8 = icmp sgt i32 %n, 0 @@ -69,18 +105,56 @@ define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 %c ; CHECK-LABEL: define void @test_store_with_index( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP13]], <4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -95,7 +169,7 @@ define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 %c ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], 
!llvm.loop [[LOOP5:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -141,14 +215,54 @@ define void @test_load_with_pointer(ptr %dst, ptr readonly %src, i32 %c, i32 %n) ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP28]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[MONOTONIC_IV]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[TMP4]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META9]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP27]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr [[TMP5]], i32 4, <4 x i1> [[TMP3]]), !alias.scope [[META6]], !noalias [[META9]] +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP21]]) +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24]] = getelementptr inbounds i8, ptr [[MONOTONIC_IV]], i64 [[TMP23]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -162,7 +276,7 @@ define void @test_load_with_pointer(ptr %dst, ptr readonly %src, i32 %c, i32 %n) ; CHECK-NEXT: [[SRC_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[SRC_ADDR_09]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: %cmp8 = icmp sgt i32 %n, 0 @@ -207,14 +321,56 @@ define void @test_load_with_index(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { ; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds i32, ptr [[TMP36]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[TMP13]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META16]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP36]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr [[TMP7]], i32 4, <4 x i1> [[TMP3]]), !alias.scope [[META13]], !noalias [[META16]] +; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP29]]) +; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP32]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -230,7 +386,7 @@ define void @test_load_with_index(ptr %dst, ptr readonly %src, i32 %c, i32 %n) { ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -339,20 +495,58 @@ define i32 @test_multiple_uses(ptr writeonly %dst, ptr 
readonly %src, i32 %c, i3 ; CHECK-LABEL: define i32 @test_multiple_uses( ; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP12]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[DST1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP13]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[MONOTONIC_IV]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP7]], <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 1 +; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP11]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: -; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: 
+; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ], [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IDX_1_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] -; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]] @@ -367,7 +561,7 @@ define i32 @test_multiple_uses(ptr writeonly %dst, ptr readonly %src, i32 %c, i3 ; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_013]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; entry: %cmp12 = icmp sgt i32 %n, 0 @@ -478,3 +672,27 @@ for.inc: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK: [[META13]] = !{[[META14:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"} +; CHECK: [[META16]] = !{[[META17:![0-9]+]]} +; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]} +;. 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index f0d943f..79195f4 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1084,7 +1084,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {}); EXPECT_TRUE(isa<VPUser>(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa<VPUser>(BaseR)); @@ -1201,7 +1201,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1215,8 +1215,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {}, - {}); + VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false, + {}, {}); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory()); |
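
A closing note on the new ComputeMonotonicResult opcode: as VPInstruction::generate above shows, with an unroll factor of 1 the monotonic value advances once per vector iteration by the number of active mask lanes times the recurrence step (reduce.add of the zero-extended mask, multiplied by the step, then an add or ptradd). A minimal scalar model of that update, assuming a fixed VF of 4 and an integer monotonic phi; the helper name is illustrative only:

  #include <bitset>
  #include <cstdint>

  // Scalar model of ComputeMonotonicResult for an integer monotonic phi:
  //   NumElems = reduce.add(zext(Mask));  PhiNext = Phi + NumElems * Step.
  // Pointer phis apply the same offset via a ptradd instead of an integer add.
  int64_t computeMonotonicResult(int64_t Phi, const std::bitset<4> &Mask,
                                 int64_t Step) {
    int64_t NumElems = static_cast<int64_t>(Mask.count());
    return Phi + NumElems * Step;
  }

In the test above, the index form uses Step == 1 (the phi simply counts selected elements), while the pointer form uses Step == 4, the allocation size of i32, matching the isCompressedPtr requirement that the pointer step equal the access size.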