 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |   30
 llvm/lib/Transforms/Vectorize/VPlan.cpp         |   40
 llvm/lib/Transforms/Vectorize/VPlan.h           |   14
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 1132
4 files changed, 510 insertions, 706 deletions
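Every hunk below applies the same mechanical change: recipe execute() implementations used to loop over State.UF unroll parts and store one generated value per part; now they generate only part 0, because unrolling happens beforehand as an explicit VPlan-to-VPlan transform that clones one recipe per part. A minimal standalone sketch of that before/after shape follows, using toy stand-ins (ToyState, executeBefore, executeAfter are hypothetical names for illustration, not the real VPlan API):

    // toy_unroll_sketch.cpp -- illustrative only; ToyState stands in for
    // VPTransformState, and integer Defs stand in for VPValues.
    #include <cassert>
    #include <map>
    #include <vector>

    struct ToyState {
      unsigned UF = 2;                               // old model: unroll factor visible at codegen
      std::map<int, std::vector<int>> PerPartValues; // def -> one value per unroll part
      void set(int Def, unsigned Part, int V) {
        auto &Vals = PerPartValues[Def];
        if (Vals.size() <= Part)
          Vals.resize(Part + 1);
        Vals[Part] = V;
      }
    };

    // Before: each recipe's execute() iterated over all UF parts itself.
    void executeBefore(ToyState &State, int Def) {
      for (unsigned Part = 0; Part < State.UF; ++Part)
        State.set(Def, Part, static_cast<int>(Part) * 4); // generatePerPart(Part)
    }

    // After: an earlier VPlan-to-VPlan unrolling transform has already cloned
    // one recipe per part, so every execute() emits exactly one value and
    // always stores it as part 0.
    void executeAfter(ToyState &State, int Def) {
      State.set(Def, 0, /*generate()=*/0);
    }

    int main() {
      ToyState Old, New;
      executeBefore(Old, /*Def=*/1); // one recipe produces UF values
      executeAfter(New, /*Def=*/1);  // two pre-cloned recipes, one value each
      executeAfter(New, /*Def=*/2);
      assert(Old.PerPartValues[1].size() == 2 && New.PerPartValues[1].size() == 1);
      return 0;
    }

This is why almost every hunk either deletes a "for (unsigned Part ...)" loop, replaces a Part argument with 0, or drops a "Part != 0" early-out.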
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b5c8763..9685e7d1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7440,7 +7440,7 @@ static void createAndCollectMergePhiForReduction(
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
   Value *FinalValue =
-      State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+      State.get(RedResult, VPIteration(0, VPLane::getFirstLane()));
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
   if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
@@ -9453,24 +9453,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   if (IsUniform) {
-    // If the recipe is uniform across all parts (instead of just per VF), only
-    // generate a single instance.
-    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
-        all_of(operands(),
-               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) {
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
-      if (user_begin() != user_end()) {
-        for (unsigned Part = 1; Part < State.UF; ++Part)
-          State.set(this, State.get(this, VPIteration(0, 0)),
-                    VPIteration(Part, 0));
-      }
-      return;
-    }
-
-    // Uniform within VL means we need to generate lane 0 only for each
-    // unrolled copy.
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
+    // Uniform within VL means we need to generate lane 0.
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
     return;
   }
 
@@ -9479,17 +9463,15 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   if (isa<StoreInst>(UI) &&
       vputils::isUniformAfterVectorization(getOperand(1))) {
     auto Lane = VPLane::getLastLaneForVF(State.VF);
-    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
-                                    State);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
     return;
   }
 
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
+  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
 }
 
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b0a6370..9a0c353 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -225,7 +225,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
 VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
                                    InnerLoopVectorizer *ILV, VPlan *Plan)
-    : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
+    : VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
       LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
 
 Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
@@ -772,9 +772,6 @@ void VPRegionBlock::execute(VPTransformState *State) {
   // Enter replicating mode.
   State->Instance = VPIteration(0, 0);
-
-  for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
-    State->Instance->Part = Part;
   assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
   for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
        ++Lane) {
@@ -784,7 +781,6 @@ void VPRegionBlock::execute(VPTransformState *State) {
       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
       Block->execute(State);
     }
-  }
   }
 
   // Exit replicating mode.
@@ -963,16 +959,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
   assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
+  unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
     VF.setUnderlyingValue(RuntimeVF);
     VFxUF.setUnderlyingValue(
-        State.UF > 1
-            ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, State.UF))
-            : RuntimeVF);
+        UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF))
+               : RuntimeVF);
   } else {
-    VFxUF.setUnderlyingValue(
-        createStepForVF(Builder, TCTy, State.VF, State.UF));
+    VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
   }
 
   // When vectorizing the epilogue loop, the canonical induction start value
@@ -1019,10 +1014,6 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
-  // Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
-  // the VPlan.
-  // TODO: Remove State::UF and all uses.
-  State->UF = 1;
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -1106,28 +1097,13 @@ void VPlan::execute(VPTransformState *State) {
     }
 
     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-    // For canonical IV, first-order recurrences and in-order reduction phis,
-    // only a single part is generated, which provides the last part from the
-    // previous iteration. For non-ordered reductions all UF parts are
-    // generated.
-    bool SinglePartNeeded =
-        isa<VPCanonicalIVPHIRecipe>(PhiR) ||
-        isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-        (isa<VPReductionPHIRecipe>(PhiR) &&
-         cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
     bool NeedsScalar =
         isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
         (isa<VPReductionPHIRecipe>(PhiR) &&
          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-    unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
-
-    for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
-      Value *Phi = State->get(PhiR, Part, NeedsScalar);
-      Value *Val =
-          State->get(PhiR->getBackedgeValue(),
-                     SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
-      cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
-    }
+    Value *Phi = State->get(PhiR, 0, NeedsScalar);
+    Value *Val = State->get(PhiR->getBackedgeValue(), 0, NeedsScalar);
+    cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
   }
 
   State->CFG.DTU.flush();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a29d972..fda0a89 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -256,7 +256,6 @@ struct VPTransformState {
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
-  unsigned UF;
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -309,7 +308,7 @@ struct VPTransformState {
     assert((VF.isScalar() || V->getType()->isVectorTy()) &&
            "scalar values must be stored as (Part, 0)");
     if (!Data.PerPartOutput.count(Def)) {
-      DataState::PerPartValuesTy Entry(UF);
+      DataState::PerPartValuesTy Entry(1);
       Data.PerPartOutput[Def] = Entry;
     }
     Data.PerPartOutput[Def][Part] = V;
@@ -1306,11 +1305,10 @@ private:
   /// needed.
   bool canGenerateScalarForFirstLane() const;
 
-  /// Utility methods serving execute(): generates a single instance of the
-  /// modeled instruction for a given part. \returns the generated value for \p
-  /// Part. In some cases an existing value is returned rather than a generated
-  /// one.
-  Value *generatePerPart(VPTransformState &State, unsigned Part);
+  /// Utility methods serving execute(): generates a single vector instance of
+  /// the modeled instruction. \returns the generated value. In some cases an
+  /// existing value is returned rather than a generated one.
+  Value *generate(VPTransformState &State);
 
   /// Utility methods serving execute(): generates a scalar single instance of
   /// the modeled instruction for a given lane. \returns the scalar generated
@@ -1616,7 +1614,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
 
   Type *ResultTy;
 
-  Value *generate(VPTransformState &State, unsigned Part);
+  Value *generate(VPTransformState &State);
 
 public:
   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 255b8b0..8f4b2951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -399,13 +399,13 @@ Value *VPInstruction::generatePerLane(VPTransformState &State,
                             State.get(getOperand(1), Lane), Name);
 }
 
-Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
+Value *VPInstruction::generate(VPTransformState &State) {
   IRBuilderBase &Builder = State.Builder;
 
   if (Instruction::isBinaryOp(getOpcode())) {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
-    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
-    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
+    Value *A = State.get(getOperand(0), 0, OnlyFirstLaneUsed);
+    Value *B = State.get(getOperand(1), 0, OnlyFirstLaneUsed);
     auto *Res =
         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
     if (auto *I = dyn_cast<Instruction>(Res))
@@ -415,26 +415,26 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
 
   switch (getOpcode()) {
   case VPInstruction::Not: {
-    Value *A = State.get(getOperand(0), Part);
+    Value *A = State.get(getOperand(0), 0);
     return Builder.CreateNot(A, Name);
   }
   case Instruction::ICmp: {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
-    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
-    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
+    Value *A = State.get(getOperand(0), 0, OnlyFirstLaneUsed);
+    Value *B = State.get(getOperand(1), 0, OnlyFirstLaneUsed);
     return Builder.CreateCmp(getPredicate(), A, B, Name);
   }
   case Instruction::Select: {
-    Value *Cond = State.get(getOperand(0), Part);
-    Value *Op1 = State.get(getOperand(1), Part);
-    Value *Op2 = State.get(getOperand(2), Part);
+    Value *Cond = State.get(getOperand(0), 0);
+    Value *Op1 =
+        State.get(getOperand(1), 0);
+    Value *Op2 = State.get(getOperand(2), 0);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
-    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+    Value *VIVElem0 = State.get(getOperand(0), VPIteration(0, 0));
     // Get the original loop tripcount.
-    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
+    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
 
     // If this part of the active lane mask is scalar, generate the CMP directly
     // to avoid unnecessary extracts.
@@ -461,18 +461,13 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     //  v2 = a[i, i+1, i+2, i+3];
     //  v3 = vector(v1(3), v2(0, 1, 2))
 
-    // For the first part, use the recurrence phi (v1), otherwise v2.
     auto *V1 = State.get(getOperand(0), 0);
-    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
-    if (!PartMinus1->getType()->isVectorTy())
-      return PartMinus1;
-    Value *V2 = State.get(getOperand(1), Part);
-    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
+    if (!V1->getType()->isVectorTy())
+      return V1;
+    Value *V2 = State.get(getOperand(1), 0);
+    return Builder.CreateVectorSplice(V1, V2, -1, Name);
   }
   case VPInstruction::CalculateTripCountMinusVF: {
-    if (Part != 0)
-      return State.get(this, 0, /*IsScalar*/ true);
-
     unsigned UF = getParent()->getPlan()->getUF();
     Value *ScalarTC = State.get(getOperand(0), {0, 0});
     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
@@ -498,7 +493,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     };
     // TODO: Restructure this code with an explicit remainder loop, vsetvli can
     // be outside of the main loop.
-    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
     // Compute VTC - IV as the AVL (requested vector length).
     Value *Index = State.get(getOperand(0), VPIteration(0, 0));
     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
@@ -517,10 +511,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
                                    hasNoSignedWrap());
   }
   case VPInstruction::BranchOnCond: {
-    if (Part != 0)
-      return nullptr;
-
-    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
+    Value *Cond = State.get(getOperand(0), VPIteration(0, 0));
     // Replace the temporary unreachable terminator with a new conditional
     // branch, hooking it up to backward destination for exiting blocks now and
     // to forward destination(s) later when they are created.
@@ -538,11 +529,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return CondBr;
   }
   case VPInstruction::BranchOnCount: {
-    if (Part != 0)
-      return nullptr;
     // First create the compare.
-    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
-    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
+    Value *IV = State.get(getOperand(0), 0, /*IsScalar*/ true);
+    Value *TC = State.get(getOperand(1), 0, /*IsScalar*/ true);
     Value *Cond = Builder.CreateICmpEQ(IV, TC);
 
     // Now create the branch.
@@ -636,9 +625,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return ReducedPartRdx;
   }
   case VPInstruction::ExtractFromEnd: {
-    if (Part != 0)
-      return State.get(this, 0, /*IsScalar*/ true);
-
     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
     unsigned Offset = CI->getZExtValue();
     assert(Offset > 0 && "Offset from end must be positive");
@@ -647,37 +633,34 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
       assert(Offset <= State.VF.getKnownMinValue() &&
              "invalid offset to extract from");
       // Extract lane VF - Offset from the operand.
-      Res = State.get(
-          getOperand(0),
-          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
+      Res = State.get(getOperand(0),
+                      VPIteration(0, VPLane::getLaneFromEnd(State.VF, Offset)));
     } else {
-      assert(Offset <= State.UF && "invalid offset to extract from");
+      assert(Offset <= 1 && "invalid offset to extract from");
       // When loop is unrolled without vectorizing, retrieve UF - Offset.
-      Res = State.get(getOperand(0), State.UF - Offset);
+      Res = State.get(getOperand(0), 1 - Offset);
     }
     if (isa<ExtractElementInst>(Res))
       Res->setName(Name);
     return Res;
   }
   case VPInstruction::LogicalAnd: {
-    Value *A = State.get(getOperand(0), Part);
-    Value *B = State.get(getOperand(1), Part);
+    Value *A = State.get(getOperand(0), 0);
+    Value *B = State.get(getOperand(1), 0);
     return Builder.CreateLogicalAnd(A, B, Name);
   }
   case VPInstruction::PtrAdd: {
     assert(vputils::onlyFirstLaneUsed(this) &&
            "can only generate first lane for PtrAdd");
-    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
-    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
+    Value *Ptr = State.get(getOperand(0), 0, /* IsScalar */ true);
+    Value *Addend = State.get(getOperand(1), 0, /* IsScalar */ true);
     return Builder.CreatePtrAdd(Ptr, Addend, Name);
   }
   case VPInstruction::ResumePhi: {
-    if (Part != 0)
-      return State.get(this, 0, /*IsScalar*/ true);
     Value *IncomingFromVPlanPred =
-        State.get(getOperand(0), Part, /* IsScalar */ true);
+        State.get(getOperand(0), 0, /* IsScalar */ true);
     Value *IncomingFromOtherPreds =
-        State.get(getOperand(1), Part, /* IsScalar */ true);
+        State.get(getOperand(1), 0, /* IsScalar */ true);
     auto *NewPhi =
         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
     BasicBlock *VPlanPred =
@@ -730,36 +713,26 @@ void VPInstruction::execute(VPTransformState &State) {
       (vputils::onlyFirstLaneUsed(this) || isVectorToScalar() ||
        isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
-  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
 
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    if (GeneratesPerAllLanes) {
-      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
-           Lane != NumLanes; ++Lane) {
-        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
-        assert(GeneratedValue && "generatePerLane must produce a value");
-        State.set(this, GeneratedValue, VPIteration(Part, Lane));
-      }
-      continue;
+  if (GeneratesPerAllLanes) {
+    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+         Lane != NumLanes; ++Lane) {
+      Value *GeneratedValue = generatePerLane(State, VPIteration(0, Lane));
+      assert(GeneratedValue && "generatePerLane must produce a value");
+      State.set(this, GeneratedValue, VPIteration(0, Lane));
     }
-
-    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
-      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
-      State.set(this, Part0, Part,
-                /*IsScalar*/
-                GeneratesPerFirstLaneOnly);
-      continue;
-    }
-
-    Value *GeneratedValue = generatePerPart(State, Part);
-    if (!hasResult())
-      continue;
-    assert(GeneratedValue && "generatePerPart must produce a value");
-    assert((GeneratedValue->getType()->isVectorTy() ==
-                !GeneratesPerFirstLaneOnly ||
-            State.VF.isScalar()) &&
-           "scalar value but not only first lane defined");
-    State.set(this, GeneratedValue, Part,
-              /*IsScalar*/ GeneratesPerFirstLaneOnly);
+    return;
   }
+
+  Value *GeneratedValue = generate(State);
+  if (!hasResult())
+    return;
+  assert(GeneratedValue && "generate must produce a value");
+  assert(
+      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
+       State.VF.isScalar()) &&
+      "scalar value but not only first lane defined");
+  State.set(this, GeneratedValue, 0,
+            /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
 
 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
@@ -893,7 +866,7 @@ void VPIRInstruction::execute(VPTransformState &State) {
     // Set insertion point in PredBB in case an extract needs to be generated.
     // TODO: Model extracts explicitly.
     State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
-    Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+    Value *V = State.get(ExitValue, VPIteration(0, Lane));
     auto *Phi = cast<PHINode>(&I);
     Phi->addIncoming(V, PredBB);
   }
@@ -928,61 +901,59 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
   FunctionType *VFTy = nullptr;
   if (Variant)
     VFTy = Variant->getFunctionType();
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    SmallVector<Type *, 2> TysForDecl;
-    // Add return type if intrinsic is overloaded on it.
+  SmallVector<Type *, 2> TysForDecl;
+  // Add return type if intrinsic is overloaded on it.
+  if (UseIntrinsic &&
+      isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
+    TysForDecl.push_back(VectorType::get(
+        CalledScalarFn->getReturnType()->getScalarType(), State.VF));
+  SmallVector<Value *, 4> Args;
+  for (const auto &I : enumerate(arg_operands())) {
+    // Some intrinsics have a scalar argument - don't replace it with a
+    // vector.
+    Value *Arg;
     if (UseIntrinsic &&
-        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
-      TysForDecl.push_back(VectorType::get(
-          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
-    SmallVector<Value *, 4> Args;
-    for (const auto &I : enumerate(arg_operands())) {
-      // Some intrinsics have a scalar argument - don't replace it with a
-      // vector.
-      Value *Arg;
-      if (UseIntrinsic &&
-          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
-        Arg = State.get(I.value(), VPIteration(0, 0));
-      // Some vectorized function variants may also take a scalar argument,
-      // e.g. linear parameters for pointers. This needs to be the scalar value
-      // from the start of the respective part when interleaving.
-      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
-        Arg = State.get(I.value(), VPIteration(Part, 0));
-      else
-        Arg = State.get(I.value(), Part);
-      if (UseIntrinsic &&
-          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
-        TysForDecl.push_back(Arg->getType());
-      Args.push_back(Arg);
-    }
+        isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
+      Arg = State.get(I.value(), VPIteration(0, 0));
+    // Some vectorized function variants may also take a scalar argument,
+    // e.g. linear parameters for pointers. This needs to be the scalar value
+    // from the start of the respective part when interleaving.
+    else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
+      Arg = State.get(I.value(), VPIteration(0, 0));
+    else
+      Arg = State.get(I.value(), 0);
+    if (UseIntrinsic &&
+        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
+      TysForDecl.push_back(Arg->getType());
+    Args.push_back(Arg);
+  }
 
-    Function *VectorF;
-    if (UseIntrinsic) {
-      // Use vector version of the intrinsic.
-      Module *M = State.Builder.GetInsertBlock()->getModule();
-      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
-      assert(VectorF && "Can't retrieve vector intrinsic.");
-    } else {
+  Function *VectorF;
+  if (UseIntrinsic) {
+    // Use vector version of the intrinsic.
+    Module *M = State.Builder.GetInsertBlock()->getModule();
+    VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
+    assert(VectorF && "Can't retrieve vector intrinsic.");
+  } else {
 #ifndef NDEBUG
-      assert(Variant != nullptr && "Can't create vector function.");
+    assert(Variant != nullptr && "Can't create vector function.");
 #endif
-      VectorF = Variant;
-    }
+    VectorF = Variant;
+  }
 
-    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
-    SmallVector<OperandBundleDef, 1> OpBundles;
-    if (CI)
-      CI->getOperandBundlesAsDefs(OpBundles);
+  auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  if (CI)
+    CI->getOperandBundlesAsDefs(OpBundles);
 
-    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
+  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
 
-    if (isa<FPMathOperator>(V))
-      V->copyFastMathFlags(CI);
+  if (isa<FPMathOperator>(V))
+    V->copyFastMathFlags(CI);
 
-    if (!V->getType()->isVoidTy())
-      State.set(this, V, Part);
-    State.addMetadata(V, CI);
-  }
+  if (!V->getType()->isVoidTy())
+    State.set(this, V, 0);
+  State.addMetadata(V, CI);
 }
 
 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
@@ -1085,14 +1056,12 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
   auto *InvarCond =
       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
 
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
-    Value *Op0 = State.get(getOperand(1), Part);
-    Value *Op1 = State.get(getOperand(2), Part);
-    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
-    State.set(this, Sel, Part);
-    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
-  }
+  Value *Cond = InvarCond ? InvarCond : State.get(getCond(), 0);
+  Value *Op0 = State.get(getOperand(1), 0);
+  Value *Op1 = State.get(getOperand(2), 0);
+  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
+  State.set(this, Sel, 0);
+  State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
 }
 
 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
@@ -1175,53 +1144,45 @@ void VPWidenRecipe::execute(VPTransformState &State) {
   case Instruction::Or:
   case Instruction::Xor: {
     // Just widen unops and binops.
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      SmallVector<Value *, 2> Ops;
-      for (VPValue *VPOp : operands())
-        Ops.push_back(State.get(VPOp, Part));
+    SmallVector<Value *, 2> Ops;
+    for (VPValue *VPOp : operands())
+      Ops.push_back(State.get(VPOp, 0));
 
-      Value *V = Builder.CreateNAryOp(Opcode, Ops);
+    Value *V = Builder.CreateNAryOp(Opcode, Ops);
 
-      if (auto *VecOp = dyn_cast<Instruction>(V))
-        setFlags(VecOp);
-
-      // Use this vector value for all users of the original instruction.
-      State.set(this, V, Part);
-      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
-    }
+    if (auto *VecOp = dyn_cast<Instruction>(V))
+      setFlags(VecOp);
+    // Use this vector value for all users of the original instruction.
+    State.set(this, V, 0);
+    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     break;
   }
   case Instruction::Freeze: {
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *Op = State.get(getOperand(0), Part);
+    Value *Op = State.get(getOperand(0), 0);
 
-      Value *Freeze = Builder.CreateFreeze(Op);
-      State.set(this, Freeze, Part);
-    }
+    Value *Freeze = Builder.CreateFreeze(Op);
+    State.set(this, Freeze, 0);
     break;
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     // Widen compares. Generate vector compares.
     bool FCmp = Opcode == Instruction::FCmp;
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *A = State.get(getOperand(0), Part);
-      Value *B = State.get(getOperand(1), Part);
-      Value *C = nullptr;
-      if (FCmp) {
-        // Propagate fast math flags.
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
-          Builder.setFastMathFlags(I->getFastMathFlags());
-        C = Builder.CreateFCmp(getPredicate(), A, B);
-      } else {
-        C = Builder.CreateICmp(getPredicate(), A, B);
-      }
-      State.set(this, C, Part);
-      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+    Value *A = State.get(getOperand(0), 0);
+    Value *B = State.get(getOperand(1), 0);
+    Value *C = nullptr;
+    if (FCmp) {
+      // Propagate fast math flags.
+      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
+        Builder.setFastMathFlags(I->getFastMathFlags());
+      C = Builder.CreateFCmp(getPredicate(), A, B);
+    } else {
+      C = Builder.CreateICmp(getPredicate(), A, B);
     }
-
+    State.set(this, C, 0);
+    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     break;
   }
   default:
@@ -1234,11 +1195,9 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #if !defined(NDEBUG)
   // Verify that VPlan type inference results agree with the type of the
   // generated values.
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
-                           State.VF) == State.get(this, Part)->getType() &&
-           "inferred type and type from generated instructions do not match");
-  }
+  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
+             State.get(this, 0)->getType() &&
+         "inferred type and type from generated instructions do not match");
 #endif
 }
 
@@ -1323,8 +1282,6 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) {
     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
 
   State.setDebugLocFrom(getDebugLoc());
-  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                          "explicit vector length.");
   assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
          "VPWidenEVLRecipe should not be used for scalars");
 
@@ -1381,17 +1338,10 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "Not vectorizing?");
   Type *DestTy = VectorType::get(getResultType(), State.VF);
   VPValue *Op = getOperand(0);
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    if (Part > 0 && Op->isLiveIn()) {
-      // FIXME: Remove once explicit unrolling is implemented using VPlan.
-      State.set(this, State.get(this, 0), Part);
-      continue;
-    }
-    Value *A = State.get(Op, Part);
-    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
-    State.set(this, Cast, Part);
-    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
-  }
+  Value *A = State.get(Op, 0);
+  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
+  State.set(this, Cast, 0);
+  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1553,19 +1503,14 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
   VecInd->setDebugLoc(EntryVal->getDebugLoc());
-  Instruction *LastInduction = VecInd;
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    State.set(this, LastInduction, Part);
+  State.set(this, VecInd, 0);
 
-    if (isa<TruncInst>(EntryVal))
-      State.addMetadata(LastInduction, EntryVal);
-
-    LastInduction = cast<Instruction>(
-        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
-    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
-  }
+  Instruction *LastInduction = cast<Instruction>(Builder.CreateBinOp(
+      AddOp, VecInd, SplatVF, "vec.ind.next", EntryVal->getDebugLoc()));
+  if (isa<TruncInst>(EntryVal))
+    State.addMetadata(LastInduction, EntryVal);
+  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
 
-  LastInduction->setName("vec.ind.next");
   VecInd->addIncoming(SteppedStart, VectorPH);
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
@@ -1667,49 +1612,42 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
   }
 
-  unsigned StartPart = 0;
-  unsigned EndPart = 1;
   unsigned StartLane = 0;
   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
   if (State.Instance) {
-    StartPart = State.Instance->Part;
-    EndPart = StartPart + 1;
     StartLane = State.Instance->Lane.getKnownLane();
     EndLane = StartLane + 1;
   }
-  // TODO: Remove loop after VPlan-based unroller lands.
-  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
-    Value *StartIdx0 =
-        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
-
-    if (!FirstLaneOnly && State.VF.isScalable()) {
-      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
-      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
-      if (BaseIVTy->isFloatingPointTy())
-        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
-      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
-      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
-      State.set(this, Add, Part);
-      // It's useful to record the lane values too for the known minimum number
-      // of elements so we do those below. This improves the code quality when
-      // trying to extract the first element, for example.
-    }
+  Value *StartIdx0 =
+      createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
 
+  if (!FirstLaneOnly && State.VF.isScalable()) {
+    auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
+    auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
     if (BaseIVTy->isFloatingPointTy())
-      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
-
-    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
-      Value *StartIdx = Builder.CreateBinOp(
-          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
-      // The step returned by `createStepForVF` is a runtime-evaluated value
-      // when VF is scalable. Otherwise, it should be folded into a Constant.
-      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
-             "Expected StartIdx to be folded to a constant when VF is not "
-             "scalable");
-      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
-      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
-      State.set(this, Add, VPIteration(Part, Lane));
-    }
+      InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+    auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+    auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+    State.set(this, Add, 0);
+    // It's useful to record the lane values too for the known minimum number
+    // of elements so we do those below. This improves the code quality when
+    // trying to extract the first element, for example.
+  }
+
+  if (BaseIVTy->isFloatingPointTy())
+    StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
+
+  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
+    Value *StartIdx = Builder.CreateBinOp(
+        AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
+    // The step returned by `createStepForVF` is a runtime-evaluated value
+    // when VF is scalable. Otherwise, it should be folded into a Constant.
+    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
+           "Expected StartIdx to be folded to a constant when VF is not "
+           "scalable");
+    auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+    auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
+    State.set(this, Add, VPIteration(0, Lane));
   }
 }
 
@@ -1752,46 +1690,37 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
     auto *NewGEP =
         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                 ArrayRef(Ops).drop_front(), "", isInBounds());
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
-      State.set(this, EntryPart, Part);
-      State.addMetadata(EntryPart, GEP);
-    }
+    Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
+    State.set(this, Splat, 0);
+    State.addMetadata(Splat, GEP);
   } else {
     // If the GEP has at least one loop-varying operand, we are sure to
-    // produce a vector of pointers. But if we are only unrolling, we want
-    // to produce a scalar GEP for each unroll part. Thus, the GEP we
-    // produce with the code below will be scalar (if VF == 1) or vector
-    // (otherwise). Note that for the unroll-only case, we still maintain
-    // values in the vector mapping with initVector, as we do for other
-    // instructions.
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // The pointer operand of the new GEP. If it's loop-invariant, we
-      // won't broadcast it.
-      auto *Ptr = isPointerLoopInvariant()
-                      ? State.get(getOperand(0), VPIteration(0, 0))
-                      : State.get(getOperand(0), Part);
-
-      // Collect all the indices for the new GEP. If any index is
-      // loop-invariant, we won't broadcast it.
-      SmallVector<Value *, 4> Indices;
-      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
-        VPValue *Operand = getOperand(I);
-        if (isIndexLoopInvariant(I - 1))
-          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
-        else
-          Indices.push_back(State.get(Operand, Part));
-      }
-
-      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-      // but it should be a vector, otherwise.
-      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
-                                             Indices, "", isInBounds());
-      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
-             "NewGEP is not a pointer vector");
-      State.set(this, NewGEP, Part);
-      State.addMetadata(NewGEP, GEP);
+    // produce a vector of pointers unless VF is scalar.
+    // The pointer operand of the new GEP. If it's loop-invariant, we
+    // won't broadcast it.
+    auto *Ptr = isPointerLoopInvariant()
+                    ? State.get(getOperand(0), VPIteration(0, 0))
+                    : State.get(getOperand(0), 0);
+
+    // Collect all the indices for the new GEP. If any index is
+    // loop-invariant, we won't broadcast it.
+    SmallVector<Value *, 4> Indices;
+    for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+      VPValue *Operand = getOperand(I);
+      if (isIndexLoopInvariant(I - 1))
+        Indices.push_back(State.get(Operand, VPIteration(0, 0)));
+      else
+        Indices.push_back(State.get(Operand, 0));
     }
+
+    // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+    // but it should be a vector, otherwise.
+    auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
+                                           Indices, "", isInBounds());
+    assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+           "NewGEP is not a pointer vector");
+    State.set(this, NewGEP, 0);
+    State.addMetadata(NewGEP, GEP);
   }
 }
 
@@ -1815,40 +1744,36 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
   unsigned CurrentPart = getUnrollPart(*this);
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
-                        ? DL.getIndexType(IndexedTy->getPointerTo())
-                        : Builder.getInt32Ty();
-    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
-    bool InBounds = isInBounds();
-    if (IsReverse) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -CurrentPart * RunTimeVF
-      Value *NumElt = Builder.CreateMul(
-          ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
-      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
-    } else {
-      Value *Increment =
-          createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
-      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
-    }
-
-    State.set(this, PartPtr, Part, /*IsScalar*/ true);
+  // Use i32 for the gep index type when the value is constant,
+  // or query DataLayout for a more suitable index type otherwise.
+  const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
+  Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
+                      ? DL.getIndexType(IndexedTy->getPointerTo())
+                      : Builder.getInt32Ty();
+  Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+  bool InBounds = isInBounds();
+
+  Value *ResultPtr = nullptr;
+  if (IsReverse) {
+    // If the address is consecutive but reversed, then the
+    // wide store needs to start at the last vector element.
+    // RunTimeVF = VScale * VF.getKnownMinValue()
+    // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+    Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+    // NumElt = -CurrentPart * RunTimeVF
+    Value *NumElt = Builder.CreateMul(
+        ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
+    // LastLane = 1 - RunTimeVF
+    Value *LastLane =
+        Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+    ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
+    ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
+  } else {
+    Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+    ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
  }
+
+  State.set(this, ResultPtr, 0, /*IsScalar*/ true);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1883,27 +1808,22 @@ void VPBlendRecipe::execute(VPTransformState &State) {
   //      In0)))
   // Note that Mask0 is never used: lanes for which no path reaches this phi and
   // are essentially undef are taken from In0.
-  VectorParts Entry(State.UF);
   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+  Value *Result = nullptr;
   for (unsigned In = 0; In < NumIncoming; ++In) {
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // We might have single edge PHIs (blocks) - use an identity
-      // 'select' for the first PHI operand.
-      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
-      if (In == 0)
-        Entry[Part] = In0; // Initialize with the first incoming value.
-      else {
-        // Select between the current value and the previous incoming edge
-        // based on the incoming mask.
-        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
-        Entry[Part] =
-            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
-      }
-    }
-  }
+    // We might have single edge PHIs (blocks) - use an identity
+    // 'select' for the first PHI operand.
+    Value *In0 = State.get(getIncomingValue(In), 0, OnlyFirstLaneUsed);
+    if (In == 0)
+      Result = In0; // Initialize with the first incoming value.
+    else {
+      // Select between the current value and the previous incoming edge
+      // based on the incoming mask.
+      Value *Cond = State.get(getMask(In), 0, OnlyFirstLaneUsed);
+      Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
    }
  }
-
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
+  State.set(this, Result, 0, OnlyFirstLaneUsed);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1937,57 +1857,51 @@ void VPReductionRecipe::execute(VPTransformState &State) {
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *NewVecOp = State.get(getVecOp(), Part);
-    if (VPValue *Cond = getCondOp()) {
-      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
-      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
-      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
-
-      Value *Start;
-      if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
-        Start = RdxDesc.getRecurrenceStartValue();
-      else
-        Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
-                                            RdxDesc.getFastMathFlags());
-      if (State.VF.isVector())
-        Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(),
-                                                Start);
-
-      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
-      NewVecOp = Select;
-    }
-    Value *NewRed;
-    Value *NextInChain;
-    if (IsOrdered) {
-      if (State.VF.isVector())
-        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
-                                        PrevInChain);
-      else
-        NewRed = State.Builder.CreateBinOp(
-            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
-            NewVecOp);
-      PrevInChain = NewRed;
-      NextInChain = NewRed;
-    } else {
-      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
-      NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
-      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
-        NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
-                                     NewRed, PrevInChain);
-      else
-        NextInChain = State.Builder.CreateBinOp(
-            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed,
-            PrevInChain);
-    }
-    State.set(this, NextInChain, Part, /*IsScalar*/ true);
+  Value *NewVecOp = State.get(getVecOp(), 0);
+  if (VPValue *Cond = getCondOp()) {
+    Value *NewCond = State.get(Cond, 0, State.VF.isScalar());
+    VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
+    Type *ElementTy = VecTy ?
+        VecTy->getElementType() : NewVecOp->getType();
+
+    Value *Start;
+    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
+      Start = RdxDesc.getRecurrenceStartValue();
+    else
+      Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
+                                          RdxDesc.getFastMathFlags());
+    if (State.VF.isVector())
+      Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
+
+    Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
+    NewVecOp = Select;
+  }
+  Value *NewRed;
+  Value *NextInChain;
+  if (IsOrdered) {
+    if (State.VF.isVector())
+      NewRed =
+          createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
+    else
+      NewRed = State.Builder.CreateBinOp(
+          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
+          NewVecOp);
+    PrevInChain = NewRed;
+    NextInChain = NewRed;
+  } else {
+    PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
+    NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
+    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
+      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
+                                   NewRed, PrevInChain);
+    else
+      NextInChain = State.Builder.CreateBinOp(
+          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
   }
+  State.set(this, NextInChain, 0, /*IsScalar*/ true);
 }
 
 void VPReductionEVLRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
-  assert(State.UF == 1 &&
-         "Expected only UF == 1 when vectorizing with explicit vector length.");
 
   auto &Builder = State.Builder;
   // Propagate the fast-math flags carried by the underlying instruction.
@@ -2112,18 +2026,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
-/// if it is either defined outside the vector region or its operand is known to
-/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
-/// TODO: Uniformity should be associated with a VPValue and there should be a
-/// generic way to check.
-static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
-  return C->isDefinedOutsideLoopRegions() ||
-         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
-         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
-}
-
-Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
+Value *VPScalarCastRecipe ::generate(VPTransformState &State) {
   assert(vputils::onlyFirstLaneUsed(this) &&
          "Codegen only implemented for first lane.");
   switch (Opcode) {
@@ -2131,7 +2034,7 @@ Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
   case Instruction::ZExt:
   case Instruction::Trunc: {
     // Note: SExt/ZExt not used yet.
-    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
+    Value *Op = State.get(getOperand(0), VPIteration(0, 0));
     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
   }
   default:
@@ -2140,17 +2043,7 @@ Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
 }
 
 void VPScalarCastRecipe ::execute(VPTransformState &State) {
-  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
-  for (unsigned Part = 0; Part != State.UF; ++Part) {
-    Value *Res;
-    // Only generate a single instance, if the recipe is uniform across UFs and
-    // VFs.
-    if (Part > 0 && IsUniformAcrossVFsAndUFs)
-      Res = State.get(this, VPIteration(0, 0));
-    else
-      Res = generate(State, Part);
-    State.set(this, Res, VPIteration(Part, 0));
-  }
+  State.set(this, generate(State), VPIteration(0, 0));
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2167,13 +2060,12 @@ void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
 
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
-  unsigned Part = State.Instance->Part;
   unsigned Lane = State.Instance->Lane.getKnownLane();
 
   Value *ConditionBit = nullptr;
   VPValue *BlockInMask = getMask();
   if (BlockInMask) {
-    ConditionBit = State.get(BlockInMask, Part);
+    ConditionBit = State.get(BlockInMask, 0);
     if (ConditionBit->getType()->isVectorTy())
       ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
@@ -2206,20 +2098,19 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   // needed. In this case the recipe of the predicated instruction is marked to
   // also do that packing, thereby "hoisting" the insert-element sequence.
   // Otherwise, a phi node for the scalar value is needed.
-  unsigned Part = State.Instance->Part;
-  if (State.hasVectorValue(getOperand(0), Part)) {
-    Value *VectorValue = State.get(getOperand(0), Part);
+  if (State.hasVectorValue(getOperand(0), 0)) {
+    Value *VectorValue = State.get(getOperand(0), 0);
     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
-    if (State.hasVectorValue(this, Part))
-      State.reset(this, VPhi, Part);
+    if (State.hasVectorValue(this, 0))
+      State.reset(this, VPhi, 0);
     else
-      State.set(this, VPhi, Part);
+      State.set(this, VPhi, 0);
     // NOTE: Currently we need to update the value of the operand, so the next
     // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), VPhi, Part);
+    State.reset(getOperand(0), VPhi, 0);
   } else {
     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
@@ -2295,34 +2186,32 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
 
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *NewLI;
-    Value *Mask = nullptr;
-    if (auto *VPMask = getMask()) {
-      // Mask reversal is only needed for non-all-one (null) masks, as reverse
-      // of a null all-one mask is a null mask.
-      Mask = State.get(VPMask, Part);
-      if (isReverse())
-        Mask = Builder.CreateVectorReverse(Mask, "reverse");
-    }
+  Value *Mask = nullptr;
+  if (auto *VPMask = getMask()) {
+    // Mask reversal is only needed for non-all-one (null) masks, as reverse
+    // of a null all-one mask is a null mask.
+    Mask = State.get(VPMask, 0);
+    if (isReverse())
+      Mask = Builder.CreateVectorReverse(Mask, "reverse");
+  }
 
-    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
-    if (CreateGather) {
-      NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
-                                         "wide.masked.gather");
-    } else if (Mask) {
-      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                       PoisonValue::get(DataTy),
-                                       "wide.masked.load");
-    } else {
-      NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
-    }
-    // Add metadata to the load, but setVectorValue to the reverse shuffle.
-    State.addMetadata(NewLI, LI);
-    if (Reverse)
-      NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
-    State.set(this, NewLI, Part);
+  Value *Addr = State.get(getAddr(), 0, /*IsScalar*/ !CreateGather);
+  Value *NewLI;
+  if (CreateGather) {
+    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+                                       "wide.masked.gather");
+  } else if (Mask) {
+    NewLI =
+        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                 PoisonValue::get(DataTy), "wide.masked.load");
+  } else {
+    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
+  // Add metadata to the load, but setVectorValue to the reverse shuffle.
+  State.addMetadata(NewLI, LI);
+  if (Reverse)
+    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+  State.set(this, NewLI, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2347,8 +2236,6 @@ static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
 }
 
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
-  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                          "explicit vector length.");
   auto *LI = cast<LoadInst>(&Ingredient);
 
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
@@ -2409,34 +2296,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
 
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Instruction *NewSI = nullptr;
-    Value *Mask = nullptr;
-    if (auto *VPMask = getMask()) {
-      // Mask reversal is only needed for non-all-one (null) masks, as reverse
-      // of a null all-one mask is a null mask.
-      Mask = State.get(VPMask, Part);
-      if (isReverse())
-        Mask = Builder.CreateVectorReverse(Mask, "reverse");
-    }
-
-    Value *StoredVal = State.get(StoredVPValue, Part);
-    if (isReverse()) {
-      // If we store to reverse consecutive memory locations, then we need
-      // to reverse the order of elements in the stored value.
-      StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
-      // We don't want to update the value in the map as it might be used in
-      // another expression. So don't call resetVectorValue(StoredVal).
-    }
-    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
-    if (CreateScatter)
-      NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
-    else if (Mask)
-      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
-    else
-      NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
-    State.addMetadata(NewSI, SI);
-  }
+  Value *Mask = nullptr;
+  if (auto *VPMask = getMask()) {
+    // Mask reversal is only needed for non-all-one (null) masks, as reverse
+    // of a null all-one mask is a null mask.
+    Mask = State.get(VPMask, 0);
+    if (isReverse())
+      Mask = Builder.CreateVectorReverse(Mask, "reverse");
+  }
+
+  Value *StoredVal = State.get(StoredVPValue, 0);
+  if (isReverse()) {
+    // If we store to reverse consecutive memory locations, then we need
+    // to reverse the order of elements in the stored value.
+    StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+    // We don't want to update the value in the map as it might be used in
+    // another expression. So don't call resetVectorValue(StoredVal).
+  }
+  Value *Addr = State.get(getAddr(), 0, /*IsScalar*/ !CreateScatter);
+  Instruction *NewSI = nullptr;
+  if (CreateScatter)
+    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+  else if (Mask)
+    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+  else
+    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+  State.addMetadata(NewSI, SI);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2448,8 +2333,6 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
-  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                          "explicit vector length.");
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredValue = getStoredValue();
@@ -2597,7 +2480,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
 
   // Prepare for the new pointers.
-  SmallVector<Value *, 2> AddrParts;
   unsigned Index = Group->getIndex(Instr);
 
   // TODO: extend the masked interleaved-group support to reversed access.
@@ -2624,41 +2506,38 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Idx = State.Builder.getInt32(-Index);
 
   VPValue *Addr = getAddr();
-  for (unsigned Part = 0; Part < State.UF; Part++) {
-    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
-    if (auto *I = dyn_cast<Instruction>(AddrPart))
-      State.setDebugLocFrom(I->getDebugLoc());
-
-    // Notice current instruction could be any index. Need to adjust the address
-    // to the member of index 0.
-    //
-    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-    //       b = A[i];       // Member of index 0
-    // Current pointer is pointed to A[i+1], adjust it to A[i].
-    //
-    // E.g.  A[i+1] = a;     // Member of index 1
-    //       A[i]   = b;     // Member of index 0
-    //       A[i+2] = c;     // Member of index 2 (Current instruction)
-    // Current pointer is pointed to A[i+2], adjust it to A[i].
-
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
-    AddrParts.push_back(AddrPart);
-  }
+  Value *ResAddr = State.get(Addr, VPIteration(0, 0));
+  if (auto *I = dyn_cast<Instruction>(ResAddr))
+    State.setDebugLocFrom(I->getDebugLoc());
+
+  // Notice current instruction could be any index. Need to adjust the address
+  // to the member of index 0.
+  //
+  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
+  //       b = A[i];       // Member of index 0
+  // Current pointer is pointed to A[i+1], adjust it to A[i].
+  //
+  // E.g.  A[i+1] = a;     // Member of index 1
+  //       A[i]   = b;     // Member of index 0
+  //       A[i+2] = c;     // Member of index 2 (Current instruction)
+  // Current pointer is pointed to A[i+2], adjust it to A[i].
+
+  bool InBounds = false;
+  if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
+    InBounds = gep->isInBounds();
+  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
 
   State.setDebugLocFrom(Instr->getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);
 
-  auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
-                             unsigned Part, Value *MaskForGaps) -> Value * {
+  auto CreateGroupMask = [&BlockInMask, &State,
+                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
-      auto *BlockInMaskPart = State.get(BlockInMask, Part);
-      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
+      auto *ResBlockInMask = State.get(BlockInMask, 0);
+      SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
                                     State.VF.getKnownMinValue() * 2, true);
      return State.Builder.CreateIntrinsic(
@@ -2669,9 +2548,9 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     if (!BlockInMask)
       return MaskForGaps;
 
-    Value *BlockInMaskPart = State.get(BlockInMask, Part);
+    Value *ResBlockInMask = State.get(BlockInMask, 0);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
-        BlockInMaskPart,
+        ResBlockInMask,
         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
@@ -2689,21 +2568,16 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
-    // For each unroll part, create a wide load for the group.
-    SmallVector<Value *, 2> NewLoads;
-    for (unsigned Part = 0; Part < State.UF; Part++) {
-      Instruction *NewLoad;
-      if (BlockInMask || MaskForGaps) {
-        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
-        NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
-                                                 Group->getAlign(), GroupMask,
-                                                 PoisonVec, "wide.masked.vec");
-      } else
-        NewLoad = State.Builder.CreateAlignedLoad(
-            VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
-      Group->addMetadata(NewLoad);
-      NewLoads.push_back(NewLoad);
-    }
+    Instruction *NewLoad;
+    if (BlockInMask || MaskForGaps) {
+      Value *GroupMask = CreateGroupMask(MaskForGaps);
+      NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
+                                               Group->getAlign(), GroupMask,
+                                               PoisonVec, "wide.masked.vec");
+    } else
+      NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
+                                                Group->getAlign(), "wide.vec");
+    Group->addMetadata(NewLoad);
 
     ArrayRef<VPValue *> VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
@@ -2711,34 +2585,31 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
 
-      for (unsigned Part = 0; Part < State.UF; ++Part) {
       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
       // so must use intrinsics to deinterleave.
-        Value *DI = State.Builder.CreateIntrinsic(
-            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
-            /*FMFSource=*/nullptr, "strided.vec");
-        unsigned J = 0;
-        for (unsigned I = 0; I < InterleaveFactor; ++I) {
-          Instruction *Member = Group->getMember(I);
-
-          if (!Member)
-            continue;
-
-          Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
-          // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); - StridedVec = - createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); - } - - if (Group->isReverse()) - StridedVec = - State.Builder.CreateVectorReverse(StridedVec, "reverse"); - - State.set(VPDefs[J], StridedVec, Part); - ++J; + Value *DI = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, VecTy, NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); } + + if (Group->isReverse()) + StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec, 0); + ++J; } return; @@ -2756,23 +2627,21 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { auto StrideMask = createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); - for (unsigned Part = 0; Part < State.UF; Part++) { - Value *StridedVec = State.Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); - - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); - StridedVec = - createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); - } + Value *StridedVec = + State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } - if (Group->isReverse()) - StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); + if (Group->isReverse()) + StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); - State.set(VPDefs[J], StridedVec, Part); - } + State.set(VPDefs[J], StridedVec, 0); ++J; } return; @@ -2787,50 +2656,47 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert((!MaskForGaps || !State.VF.isScalable()) && "masking gaps for scalable vectors is not yet supported."); ArrayRef<VPValue *> StoredValues = getStoredValues(); - for (unsigned Part = 0; Part < State.UF; Part++) { - // Collect the stored vector from each member. - SmallVector<Value *, 4> StoredVecs; - unsigned StoredIdx = 0; - for (unsigned i = 0; i < InterleaveFactor; i++) { - assert((Group->getMember(i) || MaskForGaps) && - "Fail to get a member from an interleaved store group"); - Instruction *Member = Group->getMember(i); - - // Skip the gaps in the group. - if (!Member) { - Value *Undef = PoisonValue::get(SubVT); - StoredVecs.push_back(Undef); - continue; - } + // Collect the stored vector from each member. 
+ SmallVector<Value *, 4> StoredVecs; + unsigned StoredIdx = 0; + for (unsigned i = 0; i < InterleaveFactor; i++) { + assert((Group->getMember(i) || MaskForGaps) && + "Fail to get a member from an interleaved store group"); + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } - Value *StoredVec = State.get(StoredValues[StoredIdx], Part); - ++StoredIdx; + Value *StoredVec = State.get(StoredValues[StoredIdx], 0); + ++StoredIdx; - if (Group->isReverse()) - StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); + if (Group->isReverse()) + StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); - // If this member has different type, cast it to a unified type. + // If this member has different type, cast it to a unified type. - if (StoredVec->getType() != SubVT) - StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); - StoredVecs.push_back(StoredVec); - } + StoredVecs.push_back(StoredVec); + } - // Interleave all the smaller vectors into one wider vector. - Value *IVec = - interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); - Instruction *NewStoreInstr; - if (BlockInMask || MaskForGaps) { - Value *GroupMask = CreateGroupMask(Part, MaskForGaps); - NewStoreInstr = State.Builder.CreateMaskedStore( - IVec, AddrParts[Part], Group->getAlign(), GroupMask); - } else - NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part], - Group->getAlign()); + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + Instruction *NewStoreInstr; + if (BlockInMask || MaskForGaps) { + Value *GroupMask = CreateGroupMask(MaskForGaps); + NewStoreInstr = State.Builder.CreateMaskedStore( + IVec, ResAddr, Group->getAlign(), GroupMask); + } else + NewStoreInstr = + State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign()); - Group->addMetadata(NewStoreInstr); - } + Group->addMetadata(NewStoreInstr); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2866,14 +2732,13 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); - EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *Phi = PHINode::Create(Start->getType(), 2, "index"); + Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - EntryPart->addIncoming(Start, VectorPH); - EntryPart->setDebugLoc(getDebugLoc()); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part, /*IsScalar*/ true); + Phi->addIncoming(Start, VectorPH); + Phi->setDebugLoc(getDebugLoc()); + State.set(this, Phi, 0, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2964,28 +2829,25 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { NewPointerPhi->addIncoming(InductionGEP, VectorPH); } - // Create UF many actual address geps that use the pointer - // phi as base and a vectorized version of the step value - // (<step*0, ..., step*N>) as offset. 
- for (unsigned Part = 0; Part < State.UF; ++Part) { - Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = State.Builder.CreateMul( - RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); - Value *StartOffset = - State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); - // Create a vector of consecutive numbers from zero to VF. - StartOffset = State.Builder.CreateAdd( - StartOffset, State.Builder.CreateStepVector(VecPhiType)); - - assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && - "scalar step must be the same across all parts"); - Value *GEP = State.Builder.CreateGEP( - State.Builder.getInt8Ty(), NewPointerPhi, - State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( - State.VF, ScalarStepValue)), - "vector.gep"); - State.set(this, GEP, Part); - } + // Create actual address geps that use the pointer phi as base and a + // vectorized version of the step value (<step*0, ..., step*N>) as offset. + Type *VecPhiType = VectorType::get(PhiType, State.VF); + Value *StartOffsetScalar = State.Builder.CreateMul( + RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); + Value *StartOffset = + State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); + // Create a vector of consecutive numbers from zero to VF. + StartOffset = State.Builder.CreateAdd( + StartOffset, State.Builder.CreateStepVector(VecPhiType)); + + assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, 0)) && + "scalar step must be the same across all parts"); + Value *GEP = State.Builder.CreateGEP( + State.Builder.getInt8Ty(), NewPointerPhi, + State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( + State.VF, ScalarStepValue)), + "vector.gep"); + State.set(this, GEP, 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3017,8 +2879,7 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.ExpandedSCEVs.contains(Expr) && "Same SCEV expanded multiple times"); State.ExpandedSCEVs[Expr] = Res; - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, Res, {Part, 0}); + State.set(this, Res, {0, 0}); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3038,16 +2899,14 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *VStart = VF.isScalar() ? CanonicalIV : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); - if (VF.isVector()) { - VStep = Builder.CreateVectorSplat(VF, VStep); - VStep = - Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); - } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); + Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = + Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); } + Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(this, CanonicalVectorIV, 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3082,10 +2941,10 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { } // Create a phi node for the new recurrence. 
-  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
-  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
-  EntryPart->addIncoming(VectorInit, VectorPH);
-  State.set(this, EntryPart, 0);
+  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
+  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
+  Phi->addIncoming(VectorInit, VectorPH);
+  State.set(this, Phi, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3117,12 +2976,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *HeaderBB = State.CFG.PrevBB;
   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
          "recipe must be in the vector loop header");
-  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
-  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
-    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
-    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
-    State.set(this, EntryPart, Part, IsInLoop);
-  }
+  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
+  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
+  State.set(this, Phi, 0, IsInLoop);
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
 
@@ -3160,13 +3016,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
     }
   }
 
-  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
-    Value *EntryPart = State.get(this, Part, IsInLoop);
-    // Make sure to add the reduction start value only to the
-    // first unroll part.
-    Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
-    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
-  }
+  Phi = cast<PHINode>(State.get(this, 0, IsInLoop));
+  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
+  Phi->addIncoming(StartVal, VectorPH);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3215,14 +3067,12 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 // remove VPActiveLaneMaskPHIRecipe.
 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *StartMask = State.get(getOperand(0), Part);
-    PHINode *EntryPart =
-        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
-    EntryPart->addIncoming(StartMask, VectorPH);
-    EntryPart->setDebugLoc(getDebugLoc());
-    State.set(this, EntryPart, Part);
-  }
+  Value *StartMask = State.get(getOperand(0), 0);
+  PHINode *Phi =
+      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
+  Phi->addIncoming(StartMask, VectorPH);
+  Phi->setDebugLoc(getDebugLoc());
+  State.set(this, Phi, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3238,13 +3088,11 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 
 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
   Value *Start = State.get(getOperand(0), VPIteration(0, 0));
-  PHINode *EntryPart =
-      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
-  EntryPart->addIncoming(Start, VectorPH);
-  EntryPart->setDebugLoc(getDebugLoc());
-  State.set(this, EntryPart, 0, /*IsScalar=*/true);
+  PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
+  Phi->addIncoming(Start, VectorPH);
+  Phi->setDebugLoc(getDebugLoc());
+  State.set(this, Phi, 0, /*IsScalar=*/true);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
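Note on the pattern above: every hunk in this diff removes the same construct -- the per-unroll-part loop for (unsigned Part = 0; Part < State.UF; ++Part) inside a recipe's execute(), together with the Part-indexed State.get/State.set accesses. Each recipe now emits exactly one copy of its output (part 0), and where a part number is still needed it comes from the recipe itself (CurrentPart, getUnrollPart(*this) in the hunks above) rather than from a replay loop; unrolled copies are instead expected to be modeled as separate recipes in the plan. A minimal sketch of the before/after shape, using hypothetical stand-in types (ToyState, the string values), not the real VPlan classes:

// Minimal sketch, not the real LLVM classes: ToyState/executeOld/executeNew
// are hypothetical stand-ins illustrating only the shape of the change.
#include <cstdio>
#include <string>
#include <vector>

struct ToyState {
  unsigned UF = 4;                  // unroll factor (old model only)
  std::vector<std::string> PerPart; // stands in for the generated IR values
  void set(unsigned Part, const std::string &V) {
    if (PerPart.size() <= Part)
      PerPart.resize(Part + 1);
    PerPart[Part] = V;
  }
};

// Old shape: the recipe replays its body once per unroll part.
void executeOld(ToyState &S) {
  for (unsigned Part = 0; Part < S.UF; ++Part)
    S.set(Part, "index.part" + std::to_string(Part));
}

// New shape: one instance per recipe; further parts would be separate
// recipes produced by an explicit unrolling transform on the plan.
void executeNew(ToyState &S) { S.set(0, "index"); }

int main() {
  ToyState Old, New;
  executeOld(Old);
  executeNew(New);
  std::printf("old: %zu values, new: %zu value\n", Old.PerPart.size(),
              New.PerPart.size());
}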
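The interleave-group hunks keep the "adjust to member 0" logic intact: as the retained comment explains, the recipe's address may point at any member of the group (A[i+1], A[i+2], ...), so it is rebased to the first member with a negative GEP offset (Idx = -Index) before the single wide load/store is emitted. A small sketch of that address computation under hypothetical names (groupBaseAddr; plain pointer arithmetic standing in for the GEP):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Hypothetical helper: given the scalar address of member `Index` within an
// interleave group, return the address of member 0, mirroring
// CreateGEP(ScalarTy, ResAddr, -Index) in the diff.
int *groupBaseAddr(int *MemberAddr, unsigned Index) {
  return MemberAddr - static_cast<std::ptrdiff_t>(Index);
}

int main() {
  int A[12] = {};
  // Interleave factor 3; suppose the recipe is anchored at member 2 of the
  // group starting at A[3], i.e. it holds the address of A[3 + 2].
  int *Current = &A[3 + 2];
  int *Base = groupBaseAddr(Current, 2);
  assert(Base == &A[3]);
  std::printf("rebased to A[%td]\n", Base - A);
}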
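Similarly, CreateGroupMask above still widens the per-iteration block mask for the interleaved access: with a fixed VF, each mask bit must cover all InterleaveFactor members of its iteration, which is what the shuffle built from createReplicatedMask achieves (with an extra And against MaskForGaps when the group has gaps). A scalar sketch of that replication, assuming boolean vectors in place of IR mask values:

#include <cstdio>
#include <vector>

// Replicate each per-iteration mask bit InterleaveFactor times, so the mask
// covers every member of the interleave group for that iteration.
std::vector<bool> replicateMask(const std::vector<bool> &BlockMask,
                                unsigned InterleaveFactor) {
  std::vector<bool> GroupMask;
  GroupMask.reserve(BlockMask.size() * InterleaveFactor);
  for (bool Bit : BlockMask)                // one bit per vector iteration...
    for (unsigned I = 0; I < InterleaveFactor; ++I)
      GroupMask.push_back(Bit);             // ...replicated per group member
  return GroupMask;
}

int main() {
  // VF = 4 iterations, interleave factor 2 -> 8-element group mask.
  std::vector<bool> Block = {true, false, true, true};
  std::vector<bool> Group = replicateMask(Block, 2);
  for (bool B : Group)
    std::printf("%d", B ? 1 : 0);
  std::printf("\n"); // prints 11001111
}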