diff options
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 62 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/iv_outside_user.ll | 6 |
2 files changed, 49 insertions, 19 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 395287b..3e3f5ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -562,21 +562,63 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step); } +static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { + SetVector<VPUser *> Users(V->user_begin(), V->user_end()); + for (unsigned I = 0; I != Users.size(); ++I) { + VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); + if (isa<VPHeaderPHIRecipe>(Cur)) + continue; + for (VPValue *V : Cur->definedValues()) + Users.insert(V->user_begin(), V->user_end()); + } + return Users.takeVector(); +} + /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as /// VPWidenPointerInductionRecipe will generate vectors only. If some users /// require vectors while other require scalars, the scalar uses need to extract /// the scalars from the generated vectors (Note that this is different to how -/// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe, -/// if any of its users needs scalar values, by providing them scalar steps -/// built on the canonical scalar IV and update the original IV's users. This is -/// an optional optimization to reduce the needs of vector extracts. +/// int/fp inductions are handled). Legalize extract-from-ends using uniform +/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so +/// the correct end value is available. Also optimize +/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by +/// providing them scalar steps built on the canonical scalar IV and update the +/// original IV's users. This is an optional optimization to reduce the needs of +/// vector extracts. static void legalizeAndOptimizeInductions(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; SmallVector<VPRecipeBase *> ToRemove; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&Phi); + if (!PhiR) + break; + + // Check if any uniform VPReplicateRecipes using the phi recipe are used by + // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to + // ensure the final value is available. + // TODO: Remove once uniformity analysis is done on VPlan. + for (VPUser *U : collectUsersRecursively(PhiR)) { + auto *ExitIRI = dyn_cast<VPIRInstruction>(U); + VPValue *Op; + if (!ExitIRI || !match(ExitIRI->getOperand(0), + m_VPInstruction<VPInstruction::ExtractFromEnd>( + m_VPValue(Op), m_VPValue()))) + continue; + auto *RepR = dyn_cast<VPReplicateRecipe>(Op); + if (!RepR || !RepR->isUniform()) + continue; + assert(!RepR->isPredicated() && "RepR must not be predicated"); + Instruction *I = RepR->getUnderlyingInstr(); + auto *Clone = + new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false); + Clone->insertAfter(RepR); + RepR->replaceAllUsesWith(Clone); + } + // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) { @@ -1086,18 +1128,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, return true; } -static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { - SetVector<VPUser *> Users(V->user_begin(), V->user_end()); - for (unsigned I = 0; I != Users.size(); ++I) { - VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); - if (isa<VPHeaderPHIRecipe>(Cur)) - continue; - for (VPValue *V : Cur->definedValues()) - Users.insert(V->user_begin(), V->user_end()); - } - return Users.takeVector(); -} - void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { for (VPRecipeBase &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index f03ca2d..482f731 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -981,7 +981,6 @@ exit: } ; Test case for https://github.com/llvm/llvm-project/issues/121745. -; FIXME: At the moment an incorrect exit value is used for %iv.next. define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification( ; VEC-SAME: ptr [[DST:%.*]]) { @@ -994,10 +993,12 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VEC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; VEC-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1 ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2 ; VEC-NEXT: [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]] +; VEC-NEXT: [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} @@ -1014,7 +1015,7 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8 ; VEC-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}} ; VEC: [[E_EXIT]]: -; VEC-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] ; VEC-NEXT: ret i32 [[RES]] ; ; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification( @@ -1071,7 +1072,6 @@ e.exit: ret i32 %res } -; FIXME: At the moment an incorrect exit value is used for %iv.next. define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2( ; VEC-SAME: ptr [[DST:%.*]]) { |