author    | Philip Reames <preames@rivosinc.com>      | 2025-06-13 12:45:34 -0700
committer | Philip Reames <listmail@philipreames.com> | 2025-06-13 12:50:10 -0700
commit    | f5df231d8caece81fd800b921cf4fbd7774e2885 (patch)
tree      | 73ef16de74adabdcb8a19cccc5523e3d4a5b79da /llvm
parent    | 65eaed7d5a08210cd5b419f45845d5de81435d7e (diff)
download  | llvm-f5df231d8caece81fd800b921cf4fbd7774e2885.zip
          | llvm-f5df231d8caece81fd800b921cf4fbd7774e2885.tar.gz
          | llvm-f5df231d8caece81fd800b921cf4fbd7774e2885.tar.bz2
[LV] Fix test line and regen an autogen test
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 611
1 file changed, 466 insertions, 145 deletions
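For context, the commit message says the autogenerated assertions were regenerated, and the NOTE-line change in the diff below (utils/update_test_checks.py to utils/update_analyze_test_checks.py) indicates which UpdateTestChecks script now owns the file. A rough sketch of how such a test is typically refreshed — the working directory and the way `opt` is located are assumptions, not taken from this commit:

    # Run from an LLVM checkout with a built `opt` discoverable (assumed setup);
    # the script reads the test's RUN lines and rewrites its CHECK blocks.
    llvm/utils/update_analyze_test_checks.py \
        llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

This is why the diff below is almost entirely regenerated CHECK lines rather than hand-written edits.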
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 7ec9749..b026e68 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; This is the loop in c++ being vectorize in this file with ;vector.reverse ; #pragma clang loop vectorize_width(4, scalable) @@ -46,66 +46,100 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. -; CHECK: LV: Loop does not require scalar epilogue -; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: Creating VPBasicBlock for for.body +; CHECK-NEXT: VPlan 'Plain CFG +; CHECK-NEXT: for UF>=1' { +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.body.preheader>: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 +; CHECK-NEXT: Successor(s): for.body +; CHECK-EMPTY: +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1> +; CHECK-NEXT: EMIT ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> +; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx> +; CHECK-NEXT: EMIT ir<%add9> = add ir<%1>, ir<1> +; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> +; CHECK-NEXT: EMIT store ir<%add9>, ir<%arrayidx3> +; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1> +; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1> +; CHECK-NEXT: EMIT branch-on-cond ir<%cmp> +; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body.preheader>: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT 
vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1> +; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> +; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> +; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> ; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> +; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] ; CHECK-NEXT: Successor(s): ir-bb<for.body> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: 
vp<[[RESUME2]]>.1 from scalar.ph) -; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph) +; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom +; CHECK-NEXT: IR %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: IR %add9 = add i32 %1, 1 +; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] @@ -151,85 +185,212 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin> ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF -; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: ir<%0> = original trip-count +; CHECK-NEXT: Live-in ir<%18> = VF +; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF +; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count +; CHECK-NEXT: Live-in ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body.preheader>: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<vector.scevcheck>: -; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 -; CHECK-NEXT: IR %4 = add i32 %n, -1 -; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 -; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result -; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: IR %10 = or i1 %8, %9 +; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 +; CHECK-NEXT: IR %4 = add i32 %n, -1 +; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 +; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result +; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: IR %10 = or i1 %8, %9 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck> ; CHECK-EMPTY: ; CHECK-NEXT: 
ir-bb<vector.memcheck>: -; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %12 = mul i64 %11, 4 -; CHECK-NEXT: IR %13 = mul i64 %12, 4 -; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 -; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %12 = mul i64 %11, 4 +; CHECK-NEXT: IR %13 = mul i64 %12, 4 +; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 +; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %16 = mul i64 %15, 4 -; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %18 = mul i64 %17, 4 -; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %16 = mul i64 %15, 4 +; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %18 = mul i64 %17, 4 +; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> +; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1> +; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> +; CHECK-NEXT: WIDEN ir<%19> = load vp<%4> +; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> +; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph> ; CHECK-EMPTY: ; 
CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME_1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME_2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ] ; CHECK-NEXT: Successor(s): ir-bb<for.body> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME_1]]> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME_2]]>.1 from ir-bb<scalar.ph>) -; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom +; CHECK-NEXT: IR %19 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: IR %add9 = add i32 %19, 1 +; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.body.preheader: ; preds = %entry +; CHECK-NEXT: %0 = zext i32 %n to i64 +; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %2 = mul i64 %1, 4 +; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 +; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader +; CHECK-NEXT: %3 = add nsw i64 %0, -1 +; CHECK-NEXT: %4 = add i32 %n, -1 +; CHECK-NEXT: %5 = trunc i64 %3 to i32 +; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: %6 = sub i32 %4, %mul.result +; CHECK-NEXT: %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: %10 = or i1 %8, %9 +; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck +; CHECK-NEXT: LV: draw edge fromfor.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck +; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %12 = mul i64 
%11, 4 +; CHECK-NEXT: %13 = mul i64 %12, 4 +; CHECK-NEXT: %14 = sub i64 %B1, %A2 +; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph +; CHECK-NEXT: LV: draw edge fromvector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck +; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %16 = mul i64 %15, 4 +; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %18 = mul i64 %17, 4 +; CHECK-NEXT: %19 = sub i64 %0, %n.vec +; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 +; CHECK-NEXT: %20 = sub i32 %n, %.cast +; CHECK-NEXT: br +; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: created vector.body +; CHECK-NEXT: LV: draw edge fromvector.ph +; CHECK-NEXT: LV: vectorizing VPBB:vector.body in BB:vector.body +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] +; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 +; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 +; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 +; CHECK-NEXT: %22 = zext i32 %21 to i64 +; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 +; CHECK-NEXT: %24 = mul i64 0, %18 +; CHECK-NEXT: %25 = sub i64 1, %18 +; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24 +; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25 +; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %27, align 4 +; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load) +; CHECK-NEXT: %28 = add <vscale x 4 x i32> %reverse, splat (i32 1) +; CHECK-NEXT: %29 = getelementptr inbounds i32, ptr %A, i64 %22 +; CHECK-NEXT: %30 = mul i64 0, %18 +; CHECK-NEXT: %31 = sub i64 1, %18 +; CHECK-NEXT: %32 = getelementptr inbounds i32, ptr %29, i64 %30 +; CHECK-NEXT: %33 = getelementptr inbounds i32, ptr %32, i64 %31 +; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28) +; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %33, align 4 +; CHECK-NEXT: %index.next = add nuw i64 %index, %18 +; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body +; CHECK-NEXT: LV: created middle.block +; CHECK-NEXT: LV: draw edge fromvector.body +; CHECK-NEXT: LV: vectorizing VPBB:middle.block in BB:middle.block +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: middle.block: ; preds = %vector.body +; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec +; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!> +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body +; CHECK-NEXT: br label %for.cond.cleanup +; CHECK-NEXT: LV: draw edge frommiddle.block +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader +; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, 
%vector.memcheck ] +; CHECK-NEXT: br label %for.body +; CHECK-NEXT: LV: draw edge frommiddle.block +; CHECK-NEXT: LV: draw edge fromfor.body.preheader +; CHECK-NEXT: LV: draw edge fromvector.scevcheck +; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph +; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom +; CHECK-NEXT: %35 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: %add9 = add i32 %35, 1 +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 +; CHECK-NEXT: LV: draw edge fromscalar.ph +; CHECK-NEXT: LV: Interleaving disabled by the pass manager +; CHECK-NEXT: LV: Vectorizing: innermost loop. +; CHECK-EMPTY: ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -294,66 +455,100 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
-; CHECK: LV: Loop does not require scalar epilogue -; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: Creating VPBasicBlock for for.body +; CHECK-NEXT: VPlan 'Plain CFG +; CHECK-NEXT: for UF>=1' { +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.body.preheader>: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 +; CHECK-NEXT: Successor(s): for.body +; CHECK-EMPTY: +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1> +; CHECK-NEXT: EMIT ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> +; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx> +; CHECK-NEXT: EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> +; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> +; CHECK-NEXT: EMIT store ir<%conv1>, ir<%arrayidx3> +; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1> +; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1> +; CHECK-NEXT: EMIT branch-on-cond ir<%cmp> +; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body.preheader>: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1> +; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> +; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> +; CHECK-NEXT: CLONE 
ir<%i.0> = add nsw vp<%8>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> +; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> ; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> +; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] ; CHECK-NEXT: Successor(s): ir-bb<for.body> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph) -; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph) +; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom +; CHECK-NEXT: IR %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: IR %conv1 = fadd float %1, 1.000000e+00 +; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom +; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For 
instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] @@ -399,85 +594,211 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin> ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF -; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: ir<%0> = original trip-count +; CHECK-NEXT: Live-in ir<%18> = VF +; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF +; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count +; CHECK-NEXT: Live-in ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body.preheader>: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<vector.scevcheck>: -; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 -; CHECK-NEXT: IR %4 = add i32 %n, -1 -; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 -; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result -; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: IR %10 = or i1 %8, %9 +; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 +; CHECK-NEXT: IR %4 = add i32 %n, -1 +; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 +; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result +; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: IR %10 = or i1 %8, %9 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<vector.memcheck>: -; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %12 = mul i64 %11, 4 -; CHECK-NEXT: IR %13 = mul i64 %12, 4 -; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 -; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %12 = mul i64 %11, 4 +; CHECK-NEXT: IR %13 = mul i64 %12, 4 +; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 +; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 ; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %16 = mul i64 %15, 4 -; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %18 = mul i64 %17, 4 -; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: 
IR %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %16 = mul i64 %15, 4 +; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %18 = mul i64 %17, 4 +; CHECK-NEXT: vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> +; CHECK-NEXT: vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%3>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00> +; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> +; CHECK-NEXT: WIDEN ir<%19> = load vp<%4> +; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> +; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ] ; CHECK-NEXT: Successor(s): ir-bb<for.body> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0.in8 
= phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb<scalar.ph>) -; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom +; CHECK-NEXT: IR %19 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: IR %conv1 = fadd float %19, 1.000000e+00 +; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom +; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.body.preheader: ; preds = %entry +; CHECK-NEXT: %0 = zext i32 %n to i64 +; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %2 = mul i64 %1, 4 +; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 +; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader +; CHECK-NEXT: %3 = add nsw i64 %0, -1 +; CHECK-NEXT: %4 = add i32 %n, -1 +; CHECK-NEXT: %5 = trunc i64 %3 to i32 +; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: %6 = sub i32 %4, %mul.result +; CHECK-NEXT: %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: %10 = or i1 %8, %9 +; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck +; CHECK-NEXT: LV: draw edge fromfor.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck +; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %12 = mul i64 %11, 4 +; CHECK-NEXT: %13 = mul i64 %12, 4 +; CHECK-NEXT: %14 = sub i64 %B1, %A2 +; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph +; CHECK-NEXT: LV: draw edge fromvector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck +; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %16 = mul i64 %15, 4 +; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %18 = mul i64 %17, 4 +; CHECK-NEXT: %19 = sub i64 %0, %n.vec +; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 +; CHECK-NEXT: %20 = sub i32 %n, %.cast +; CHECK-NEXT: br +; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: created vector.body +; CHECK-NEXT: LV: draw edge fromvector.ph +; CHECK-NEXT: LV: vectorizing VPBB:vector.body in 
BB:vector.body +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] +; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 +; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 +; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 +; CHECK-NEXT: %22 = zext i32 %21 to i64 +; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 +; CHECK-NEXT: %24 = mul i64 0, %18 +; CHECK-NEXT: %25 = sub i64 1, %18 +; CHECK-NEXT: %26 = getelementptr inbounds float, ptr %23, i64 %24 +; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %26, i64 %25 +; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %27, align 4 +; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load) +; CHECK-NEXT: %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00) +; CHECK-NEXT: %29 = getelementptr inbounds float, ptr %A, i64 %22 +; CHECK-NEXT: %30 = mul i64 0, %18 +; CHECK-NEXT: %31 = sub i64 1, %18 +; CHECK-NEXT: %32 = getelementptr inbounds float, ptr %29, i64 %30 +; CHECK-NEXT: %33 = getelementptr inbounds float, ptr %32, i64 %31 +; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28) +; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %33, align 4 +; CHECK-NEXT: %index.next = add nuw i64 %index, %18 +; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec +; CHECK-NEXT: br i1 %34, <null operand!>, label %vector.body +; CHECK-NEXT: LV: created middle.block +; CHECK-NEXT: LV: draw edge fromvector.body +; CHECK-NEXT: LV: vectorizing VPBB:middle.block in BB:middle.block +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: middle.block: ; preds = %vector.body +; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec +; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!> +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body +; CHECK-NEXT: br label %for.cond.cleanup +; CHECK-NEXT: LV: draw edge frommiddle.block +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader +; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] +; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] +; CHECK-NEXT: br label %for.body +; CHECK-NEXT: LV: draw edge frommiddle.block +; CHECK-NEXT: LV: draw edge fromfor.body.preheader +; CHECK-NEXT: LV: draw edge fromvector.scevcheck +; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body +; CHECK-NEXT: LV: filled BB: +; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph +; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] +; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom +; CHECK-NEXT: %35 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: %conv1 = fadd float %35, 1.000000e+00 +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom 
+; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 +; CHECK-NEXT: LV: draw edge fromscalar.ph +; CHECK-NEXT: LV: Interleaving disabled by the pass manager +; CHECK-NEXT: LV: Vectorizing: innermost loop. ; entry: %cmp7 = icmp sgt i32 %n, 0 |