author    | Samuel Tebbs <samuel.tebbs@arm.com> | 2025-06-03 11:36:22 +0100
committer | Samuel Tebbs <samuel.tebbs@arm.com> | 2025-06-03 11:38:37 +0100
commit    | f47a24c9877cc6d547031ecba26dda1e39cfbb38 (patch)
tree      | 7f5b88b81d27f2f9b167afdf5bb88a683dda0ccb
parent    | 91122e44387664a42282cb6c4097247fde6f132a (diff)
8 files changed, 109 insertions, 90 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 599bc81..0863dc7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2721,7 +2721,7 @@ public:
   VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
                                  VPWidenCastRecipe *Ext0,
                                  VPWidenCastRecipe *Ext1, Type *ResultTy,
-                                 unsigned ScaleFactor = 0)
+                                 unsigned ScaleFactor = 1)
       : VPReductionRecipe(
             VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
             {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4a60b19..e665192 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2572,7 +2572,7 @@ VPExtendedReductionRecipe::computeCost(ElementCount VF,
 InstructionCost
 VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
-  if (getVFScaleFactor() > 0) {
+  if (getVFScaleFactor() > 1) {
     return Ctx.TTI.getPartialReductionCost(
         Instruction::Add, Ctx.Types.inferScalarType(getVecOp0()),
         Ctx.Types.inferScalarType(getVecOp1()), getResultType(), VF,
@@ -2658,7 +2658,7 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " + ";
-  if (getVFScaleFactor() > 0)
+  if (getVFScaleFactor() > 1)
     O << "partial.";
   O << "reduce."
     << Instruction::getOpcodeName(
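The two hunks above change the sentinel for "no VF scaling" from 0 to 1, so the cost-model and printing paths now test for a factor greater than 1. A minimal C++ sketch of that convention, using a hypothetical helper struct rather than the actual VPMulAccumulateReductionRecipe API:

    // Hedged sketch, not the real recipe class: a scale factor of 1 is the
    // natural "no narrowing" default, and anything greater marks a partial
    // reduction whose result vector is VF/N elements wide.
    #include <cassert>

    struct MulAccInfo {
      unsigned VFScaleFactor = 1; // 1 = ordinary mul-accumulate reduction

      bool isPartialReduction() const { return VFScaleFactor > 1; }

      // e.g. VF = 16 with factor 4 yields a <4 x i32> accumulator, as in the
      // dot-product tests below.
      unsigned resultLanes(unsigned VF) const {
        assert(VF % VFScaleFactor == 0 && "VF must be divisible by the factor");
        return VF / VFScaleFactor;
      }
    };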
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 518fb7b..89228d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2583,7 +2583,7 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
   // Generate VPReductionRecipe.
   VPReductionRecipe *Red = nullptr;
-  if (unsigned ScaleFactor = MulAcc->getVFScaleFactor())
+  if (unsigned ScaleFactor = MulAcc->getVFScaleFactor(); ScaleFactor > 1)
     Red = new VPPartialReductionRecipe(Instruction::Add, MulAcc->getChainOp(),
                                        Mul, MulAcc->getCondOp(), ScaleFactor);
   else
@@ -2940,7 +2940,8 @@ tryToCreateAbstractPartialReductionRecipe(VPPartialReductionRecipe *PRed) {
     return;
 
   auto *AbstractR = new VPMulAccumulateReductionRecipe(
-      PRed, BinOpR, Ext0R, Ext1R, Ext0R->getResultType());
+      PRed, BinOpR, Ext0R, Ext1R, Ext0R->getResultType(),
+      PRed->getVFScaleFactor());
   AbstractR->insertBefore(PRed);
   PRed->replaceAllUsesWith(AbstractR);
 }
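The first VPlanTransforms.cpp hunk also sidesteps a C++ truthiness pitfall: once 1 became the default, the old `if (unsigned S = MulAcc->getVFScaleFactor())` would be taken for every recipe, because any non-zero value converts to true. A small self-contained illustration (the stand-in function below is hypothetical, not the VPlan API):

    // Hedged illustration of the if-with-initializer pitfall fixed above.
    #include <cstdio>

    unsigned getVFScaleFactor() { return 1; } // stand-in for the recipe query

    int main() {
      // Old form: tests the value for truthiness, so 1 (no scaling) would
      // still take the partial-reduction path.
      if (unsigned ScaleFactor = getVFScaleFactor())
        std::printf("old form: partial, factor %u\n", ScaleFactor);

      // New form: only factors greater than 1 are treated as partial.
      if (unsigned ScaleFactor = getVFScaleFactor(); ScaleFactor > 1)
        std::printf("new form: partial, factor %u\n", ScaleFactor);
      else
        std::printf("new form: ordinary reduction\n");
      return 0;
    }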
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index ad4fe3e..3ab85e3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -22,7 +22,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 617c296..0348d09 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -19,7 +19,7 @@ define i32 @dotp(ptr %a, ptr %b) {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
@@ -49,7 +49,7 @@ define i32 @dotp(ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
@@ -83,7 +83,7 @@ define i32 @dotp(ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
@@ -800,37 +800,68 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP37]])
 ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP46]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP47]])
 ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP51]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
 ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP44]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
 ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP49]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
@@ -915,6 +946,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW: scalar.ph:
+;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 02b167f..a92eed9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -110,11 +110,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
@@ -262,10 +261,8 @@ define i64 @dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP8]]
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
@@ -1407,10 +1404,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX]], 2
@@ -1636,7 +1632,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
@@ -1943,11 +1939,10 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
@@ -2567,10 +2562,8 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
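The @llvm.experimental.vector.partial.reduce.add calls checked above and below fold a wide multiply result into a narrower accumulator; the intrinsic only guarantees that the overall sum is preserved, leaving the input-to-result lane grouping to the target. A hedged C++ model of one valid lowering (the modulo lane mapping is an assumption, not mandated by the intrinsic):

    // One valid lowering of partial.reduce.add: add input lane I into
    // accumulator lane I % AccLanes, e.g. <16 x i32> into <4 x i32>.
    #include <array>
    #include <cstddef>
    #include <cstdint>

    template <std::size_t AccLanes, std::size_t InLanes>
    std::array<int32_t, AccLanes>
    partialReduceAdd(std::array<int32_t, AccLanes> Acc,
                     const std::array<int32_t, InLanes> &In) {
      static_assert(InLanes % AccLanes == 0,
                    "input lanes must be a multiple of accumulator lanes");
      for (std::size_t I = 0; I < InLanes; ++I)
        Acc[I % AccLanes] += In[I];
      return Acc;
    }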
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index de086cc..9e0883c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -70,10 +70,6 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
 ; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
-; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
 ; CHECK-NEXT: Live-in ir<[[EP_VFxUF:.+]]> = VF * UF
@@ -87,18 +83,16 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi ir<0>, vp<%index.next>
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
 ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4)
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[EP_IV]]>, ir<1>
-; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
 ; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
 ; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = sext ir<%load.a> to i32
-; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
 ; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
 ; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
 ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
@@ -106,13 +100,11 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
 ; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-last-element vp<[[RED_RESULT]]>
 ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
 ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
@@ -122,8 +114,8 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT vp<[[EP_RESUME:%.+]]> = resume-phi ir<1024>, ir<0>
-; CHECK-NEXT: EMIT vp<[[EP_MERGE:%.+]]> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = resume-phi ir<1024>, ir<0>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = resume-phi vp<[[RED_RESULT]]>, ir<0>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -164,12 +156,13 @@ exit:
 
 define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: Live-in ir<1024> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
 ; CHECK-NEXT: Successor(s): vector loop
@@ -177,7 +170,7 @@ define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
@@ -185,7 +178,7 @@ define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
 ; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
-; CHECK-NEXT: MULACC-REDUCE vp<[[REDUCE:%.+]]> = ir<%accum> + partial.reduce.add (mul (ir<%load.b> extended to i32), (ir<%load.a> extended to i32))
+; CHECK-NEXT: MULACC-REDUCE ir<[[REDUCE:%.+]]> = ir<%accum> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -193,15 +186,19 @@ define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]>
-; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
-; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<%1>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-last-element vp<[[RED_RESULT]]>
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ [[REDUCE]], %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<[[VEC_TC]]>, ir<0>
-; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -233,30 +230,26 @@ define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, vp<%7> (VF scaled by 1/4)
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[EP_IV]]>, ir<1>
-; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
-; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
-; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
-; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
-; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%load.b> to i32
-; CHECK-NEXT: WIDEN-CAST vp<%5> = zext ir<%load.a> to i32
-; CHECK-NEXT: WIDEN vp<%6> = mul vp<%4>, vp<%5>
-; CHECK-NEXT: PARTIAL-REDUCE vp<[[REDUCE:%.+]]> = add ir<%accum>, vp<%6>
-; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
-; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
+; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
+; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
+; CHECK-NEXT: WIDEN-CAST vp<[[EXT_B:%.+]]> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST vp<[[EXT_A:%.+]]> = zext ir<%load.a> to i32
+; CHECK-NEXT: WIDEN vp<[[MUL:%.+]]> = mul vp<[[EXT_B]]>, vp<[[EXT_A]]>
+; CHECK-NEXT: PARTIAL-REDUCE vp<[[REDUCE]]> = add ir<%accum>, vp<[[MUL]]>
+; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
+; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%9> = compute-reduction-result ir<%accum>, vp<%7>
-; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<%9>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, vp<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-last-element vp<[[RED_RESULT]]>
 ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
 ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
@@ -266,8 +259,8 @@ define i32 @print_bundled_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT vp<[[EP_RESUME:%.+]]> = resume-phi ir<1024>, ir<0>
-; CHECK-NEXT: EMIT vp<[[EP_MERGE:%.+]]> = resume-phi vp<%9>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = resume-phi ir<1024>, ir<0>, ir<0>
+; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = resume-phi vp<[[RED_RESULT]]>, ir<0>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 847c4ba..5214b0a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -65,11 +65,11 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
 ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
 ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
 ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
 ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
 ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -133,18 +133,18 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
 ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
 ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
 ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
 ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
 ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
 ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -237,11 +237,11 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
 ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
 ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
 ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
 ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
 ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -305,18 +305,18 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
 ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
 ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
 ; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
 ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
 ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
 ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
 ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
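The interleaved checks above keep two independent partial reductions per iteration, each accumulating a disjoint half of the data. A quick, self-contained sanity check of that pattern under the modulo-lane model sketched earlier (arbitrary values, hypothetical code, not from the test suite): summing all accumulator lanes must match a plain scalar sum.

    #include <array>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <numeric>

    int main() {
      std::array<int32_t, 8> In{};
      std::iota(In.begin(), In.end(), 1); // 1..8, stand-in for mul results
      std::array<int32_t, 2> Acc{};       // two accumulator lanes
      for (std::size_t I = 0; I < In.size(); ++I)
        Acc[I % Acc.size()] += In[I];     // partial.reduce.add model
      // The reduction total is preserved regardless of lane grouping.
      assert(Acc[0] + Acc[1] ==
             std::accumulate(In.begin(), In.end(), int32_t{0}));
      return 0;
    }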