Diffstat (limited to 'llvm/test/Transforms')
-rw-r--r-- llvm/test/Transforms/GVN/PRE/pre-load.ll | 48
-rw-r--r-- llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll | 16
-rw-r--r-- llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll | 738
-rw-r--r-- llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll | 55
-rw-r--r-- llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll | 4
-rw-r--r-- llvm/test/Transforms/InstCombine/ptr-int-cast.ll | 11
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll | 230
-rw-r--r-- llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll | 13
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll | 18
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll | 19
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll | 30
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll | 7
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll | 5
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll | 40
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll | 32
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/avx1.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll | 22
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/cost-model.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll | 8
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll | 22
-rw-r--r-- llvm/test/Transforms/LoopVectorize/assume.ll | 20
-rw-r--r-- llvm/test/Transforms/LoopVectorize/bsd_regex.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/if-conversion.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/memdep.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll | 9
-rw-r--r-- llvm/test/Transforms/LoopVectorize/operand-bundles.ll | 227
-rw-r--r-- llvm/test/Transforms/LoopVectorize/partial-lcssa.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/pr28541.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/runtime-check.ll | 16
-rw-r--r-- llvm/test/Transforms/LoopVectorize/scalable-assume.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/write-only.ll | 12
-rw-r--r-- llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll | 2
42 files changed, 1548 insertions, 196 deletions
diff --git a/llvm/test/Transforms/GVN/PRE/pre-load.ll b/llvm/test/Transforms/GVN/PRE/pre-load.ll
index 5a07f9f..afa1354 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-load.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-load.ll
@@ -1503,3 +1503,51 @@ wrong:
exit:
ret void
}
+
+; Allow the load to be made available on the edge (%entry, %if.end) as part of PRE,
+; but ensure `%identical.l` is not hoisted to its predecessor due to the local
+; dependency with the call.
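+; (The MDEP and MSSA prefixes below are assumed to match this file's two GVN RUN
+; lines: the MemoryDependenceAnalysis-based and MemorySSA-based configurations.)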
+
+define i32 @test24(ptr noalias %p, ptr noalias %q, i1 %c) {
+; MDEP-LABEL: @test24(
+; MDEP-NEXT: entry:
+; MDEP-NEXT: br i1 [[C:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]]
+; MDEP: entry.if.end_crit_edge:
+; MDEP-NEXT: [[VV_PRE:%.*]] = load i32, ptr [[X:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END:%.*]]
+; MDEP: if.then:
+; MDEP-NEXT: call void @opaque(ptr [[X]])
+; MDEP-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MDEP-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END]]
+; MDEP: if.end:
+; MDEP-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ]
+; MDEP-NEXT: ret i32 [[VV]]
+;
+; MSSA-LABEL: @test24(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; MSSA: if.then:
+; MSSA-NEXT: call void @opaque(ptr [[X:%.*]])
+; MSSA-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MSSA-NEXT: br label [[IF_END]]
+; MSSA: if.end:
+; MSSA-NEXT: [[VV:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: ret i32 [[VV]]
+;
+entry:
+ br i1 %c, label %if.end, label %if.then
+
+if.then:
+ call void @opaque(ptr %p)
+ %identical.l = load i32, ptr %p, align 4
+ store i32 %identical.l, ptr %q, align 4
+ br label %if.end
+
+if.end:
+ %l = load i32, ptr %p, align 4
+ ret i32 %l
+}
+
+declare void @opaque(ptr) nounwind willreturn
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
index cb4e07e..9b9bc68 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
@@ -60,8 +60,7 @@ define void @f_sadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -150,8 +149,7 @@ define void @f_uadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -243,10 +241,7 @@ define void @f_ssub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -339,10 +334,7 @@ define void @f_usub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
diff --git a/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
new file mode 100644
index 0000000..b9c9228
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
@@ -0,0 +1,738 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=indvars < %s | FileCheck %s
+
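+; These tests exercise how IndVarSimplify treats exit conditions that guard
+; unreachable (trap) exits. In @optimize_trap and @optimize_ubsan_trap the bound
+; check feeding the trap is rewritten into a loop-invariant condition computed in
+; the preheader. The no_optimize_* cases show where the rewrite must be skipped:
+; atomic or volatile stores in the loop, other calls with side effects, noreturn
+; callees that read argument memory, additional loop exits, and trap calls whose
+; arguments depend on loop-varying values.
+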
+define void @optimize_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_atomic(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_atomic(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store atomic i8 %1, ptr %arrayidx7 unordered, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_volatile(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_volatile(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: call void @x(ptr null)
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ call void @x(ptr null)
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @optimize_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.ubsantrap(i8 1)
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.ubsantrap(i8 1)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_arbitrary_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_arbitrary_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn_with_argmem(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_argmem(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[FOR_BODY_CONT:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body:
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %p = call i1 @pred()
+ br i1 %p, label %for.body.cont, label %for.cond.cleanup.loopexit
+
+for.body.cont:                                    ; preds = %for.body
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then:                                          ; preds = %for.body.cont
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4:                                          ; preds = %for.body.cont
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits2(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits2(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_BODY_CONT:.*]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[IF_END4]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body:
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %for.body.cont
+
+for.body.cont:                                    ; preds = %for.body
+ %p = call i1 @pred()
+ br i1 %p, label %if.end4, label %for.cond.cleanup.loopexit
+
+if.then: ; preds = %for.body
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4:                                          ; preds = %for.body.cont
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_dependent_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_dependent_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = phi i32 [ [[I_015]], %[[FOR_BODY]] ]
+; CHECK-NEXT: call void @noreturn_with_i32(i32 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_i32(i32 %i.015)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_dependent_load_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_dependent_load_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = load i8, ptr [[FOO_ARR]], align 1
+; CHECK-NEXT: call void @noreturn_with_i8(i8 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ %r = load i8, ptr %foo_arr, align 1
+ call void @noreturn_with_i8(i8 %r)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+
+declare void @x(ptr noundef) local_unnamed_addr
+declare i1 @pred() local_unnamed_addr
+
+declare void @llvm.trap() #0
+declare void @noreturn(ptr) #0
+declare void @noreturn_with_i32(i32) #0
+declare void @noreturn_with_i8(i8) #0
+declare void @noreturn_with_argmem(ptr) #1
+
+attributes #0 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #1 = { cold noreturn nounwind memory(argmem: read) }
diff --git a/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll
new file mode 100644
index 0000000..0887f5e
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: llvm_inliner_model_autogenerated && asserts
+; RUN: opt -passes='default<O3>' -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s
+; RUN: opt -passes='default<O3>' -ml-inliner-stop-immediately -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s
+
+declare ptr @f()
+
+define void @e() #0 {
+; CHECK-LABEL: define void @e(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: ret void
+;
+ call void @h()
+ call void @h()
+ call void @h()
+ ret void
+}
+
+define void @d() {
+; CHECK-LABEL: define void @d() local_unnamed_addr {
+; CHECK-NEXT: tail call void @f()
+; CHECK-NEXT: ret void
+;
+ call void @f()
+ ret void
+}
+
+define void @g() {
+; CHECK-LABEL: define void @g() local_unnamed_addr {
+; CHECK-NEXT: tail call void @f()
+; CHECK-NEXT: ret void
+;
+ call void @f()
+ ret void
+}
+
+define void @h() #0 {
+; CHECK-LABEL: define void @h(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: ret void
+;
+ call void @d()
+ call void @g()
+ ret void
+}
+
+attributes #0 = { "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" }
diff --git a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
index 7cc4446..ad45d1e 100644
--- a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
+++ b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
@@ -11,10 +11,8 @@ define i16 @test5(i16 %A) !dbg !34 {
call void @llvm.dbg.value(metadata i32 %C, metadata !37, metadata !DIExpression()), !dbg !41
; Preserve the dbg.value for the DCE'd 32-bit 'and'.
- ;
- ; The high 16 bits of the original 'and' require sign-extending the new 16-bit and:
; CHECK-NEXT: #dbg_value(i16 [[and]], [[C:![0-9]+]],
- ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value)
+ ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_stack_value)
%D = trunc i32 %C to i16, !dbg !42
call void @llvm.dbg.value(metadata i16 %D, metadata !38, metadata !DIExpression()), !dbg !42
diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
index 69b8f69..82ecbd4 100644
--- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
@@ -86,3 +86,14 @@ define <4 x ptr> @test7(<4 x i128> %arg) nounwind {
%p1 = inttoptr <4 x i128> %arg to <4 x ptr>
ret <4 x ptr> %p1
}
+
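+; %end is %ptr advanced by (%end.addr - ptrtoint %ptr), so ptrtoint of %end
+; folds back to %end.addr.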
+define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) {
+; CHECK-LABEL: @ptrtoint_gep_sub(
+; CHECK-NEXT: ret i64 [[END_ADDR:%.*]]
+;
+ %ptr.addr = ptrtoint ptr %ptr to i64
+ %size = sub i64 %end.addr, %ptr.addr
+ %end = getelementptr i8, ptr %ptr, i64 %size
+ %end.addr2 = ptrtoint ptr %end to i64
+ ret i64 %end.addr2
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index ed9fba3..22ab79d 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -289,6 +289,225 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
ret void
}
+define void @deinterleave1_nxi64_factor3(ptr %ptr, <vscale x 4 x i64>* %s1, <vscale x 4 x i64>* %s2, <vscale x 4 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } poison, <vscale x 4 x i64> [[TMP10]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP15]], <vscale x 4 x i64> [[TMP12]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP16]], <vscale x 4 x i64> [[TMP14]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 2
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP18]], ptr [[S1]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP19]], ptr [[S2]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP20]], ptr [[S3]], align 32
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 12 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 2
+
+ store <vscale x 4 x i64> %3, <vscale x 4 x i64>* %s1
+ store <vscale x 4 x i64> %4, <vscale x 4 x i64>* %s2
+ store <vscale x 4 x i64> %5, <vscale x 4 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave2_nxi64_factor3(ptr %ptr, <vscale x 8 x i64>* %s1, <vscale x 8 x i64>* %s2, <vscale x 8 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP16]], i64 4)
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP18]], i64 4)
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP20]], i64 4)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[LDN4:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP17]], <vscale x 2 x i64> [[TMP23]], i64 6)
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP25]], i64 6)
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 2
+; CHECK-NEXT: [[TMP28:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP21]], <vscale x 2 x i64> [[TMP27]], i64 6)
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } poison, <vscale x 8 x i64> [[TMP24]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP29]], <vscale x 8 x i64> [[TMP26]], 1
+; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP30]], <vscale x 8 x i64> [[TMP28]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 2
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP32]], ptr [[S1]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP33]], ptr [[S2]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP34]], ptr [[S3]], align 64
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 2
+
+ store <vscale x 8 x i64> %3, <vscale x 8 x i64>* %s1
+ store <vscale x 8 x i64> %4, <vscale x 8 x i64>* %s2
+ store <vscale x 8 x i64> %5, <vscale x 8 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg1_nxi64_factor3(ptr %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2
+
+ store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1
+ store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2
+ store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg2_nxi8_factor3(ptr %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg2_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2
+
+ store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1
+ store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2
+ store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3
+ ret void
+}
+
+define void @interleave1_nxi64_factor3(ptr %ptr, <vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i64> [[S1:%.*]], <vscale x 8 x i64> [[S2:%.*]], <vscale x 8 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 4)
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 6)
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 6)
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 6)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP13]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3)
+
+ store <vscale x 24 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave2_nxi64_factor3(ptr %ptr, <vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i64> [[S1:%.*]], <vscale x 4 x i64> [[S2:%.*]], <vscale x 4 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3)
+
+ store <vscale x 12 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_neg_nxi8_factor3(ptr %ptr, <vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) #0 {
+; CHECK-LABEL: define void @interleave_neg_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i8> [[S1:%.*]], <vscale x 8 x i8> [[S2:%.*]], <vscale x 8 x i8> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> [[S1]], <vscale x 8 x i8> [[S2]], <vscale x 8 x i8> [[S3]])
+; CHECK-NEXT: store <vscale x 24 x i8> [[INTERLEAVE]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3)
+
+ store <vscale x 24 x i8> %interleave, ptr %ptr, align 4
+ ret void
+}
+
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
@@ -312,4 +531,15 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <
; Larger interleaves to test 'legalization'
declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+; De-Interleaves with Factor=3
+declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64>)
+declare { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>)
+
+; Interleaves with Factor=3
+declare <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64>)
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 279d4e8..83623fd 100644
--- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK-LABEL: @foo(
;CHECK: icmp eq <4 x i32>
;CHECK: select <4 x i1>
-;CHECK: ret i32
-define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
+;CHECK: ret void
+define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
entry:
%cmp10 = icmp sgt i32 %x, 0
br i1 %cmp10, label %for.body, label %for.end
@@ -35,5 +35,5 @@ if.end: ; preds = %for.body, %if.then
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end, %entry
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 6cf11be..6fe6883 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -660,16 +660,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT1]]
+; COMMON-NEXT: br label %[[EXIT]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
+; COMMON: [[SCALAR_PH]]:
+; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
-; COMMON-NEXT: br label %[[SCALAR_PH1:.*]]
-; COMMON: [[SCALAR_PH1]]:
-; COMMON-NEXT: br [[EXIT:label %.*]]
-; COMMON: [[SCALAR_PH:.*:]]
+; COMMON-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 93e71af..e3e4833 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -42,7 +42,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -80,7 +80,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -104,7 +104,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -167,13 +167,13 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -211,7 +211,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -235,7 +235,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -308,7 +308,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -332,7 +332,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index e424649..75b18ff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -541,3 +541,22 @@ exit: ; preds = %for.body
; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-VS1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS1: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
+; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VS2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS2: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index a6e0f8a..300f5d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -40,6 +40,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -53,6 +54,15 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -262,6 +272,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -275,6 +286,15 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -412,6 +432,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -425,6 +446,15 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index ab9b48f..aff2c4c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -153,17 +153,20 @@ define void @uniform_gep_for_replicating_gep(ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP18]], align 1
-; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP19]], align 1
+; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
index 8830ce3..5f79d02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -38,8 +38,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -96,8 +97,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index d447517..f03f743 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -29,8 +29,9 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[COND:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index b8f4e84..753847f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -33,8 +33,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
@@ -87,8 +88,9 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl
; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e..d0c1194 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -36,7 +36,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %exit, label %for.body
}
-define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
+define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032
@@ -70,7 +70,7 @@ for.cond.cleanup.loopexit: ; preds = %if.end
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 undef
+ ret void
for.body: ; preds = %for.body.preheader, %if.end
%i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index e046816..e84c0d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -67,7 +67,7 @@ define void @test_may_clobber(ptr %p) {
; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -111,7 +111,7 @@ define void @trivial_due_max_vscale(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -155,7 +155,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -207,7 +207,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -221,7 +221,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 2fbc73e..c66d8d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -133,7 +133,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -237,7 +237,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -346,7 +346,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -360,7 +360,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -468,7 +468,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -483,7 +483,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -640,7 +640,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -656,7 +656,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -790,14 +790,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]]
; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META6:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]]
; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -813,7 +813,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -965,7 +965,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -981,7 +981,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -1145,16 +1145,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META12:![0-9]+]]
; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META15:![0-9]+]], !noalias [[META12]]
; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]]
; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -1170,7 +1170,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]]
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -1318,7 +1318,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[LOOP:%.*]]
; NOSTRIDED: exit:
@@ -1402,7 +1402,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[LOOP:%.*]]
; STRIDED: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 0c22a9e..46daee4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -142,7 +142,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -267,7 +267,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -382,7 +382,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -508,7 +508,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -621,7 +621,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index bae97e5..c34417b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -129,7 +129,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -143,7 +143,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SCALABLE: [[FOR_END]]:
; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -204,7 +204,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -218,7 +218,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -269,7 +269,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -350,7 +350,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -399,7 +399,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -457,7 +457,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -499,7 +499,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -557,7 +557,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -608,7 +608,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -679,7 +679,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -731,7 +731,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -812,7 +812,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -860,7 +860,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -918,7 +918,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
index 9e20586..44fb8cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
; CHECK-LABEL: @read_mod_write_single_ptr(
; CHECK: load <8 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
; CHECK-LABEL: @read_mod_i64(
; SLOWMEM32: load <2 x i64>
; FASTMEM32: load <4 x i64>
-; CHECK: ret i32
-define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e11b1ad..27d5e64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" }
; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1
; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1
-; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
;
;
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
@@ -215,8 +214,9 @@ define void @PR40816() #1 {
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FORCE: [[MIDDLE_BLOCK]]:
-; FORCE-NEXT: br [[RETURN:label %.*]]
-; FORCE: [[SCALAR_PH:.*:]]
+; FORCE-NEXT: br label %[[RETURN:.*]]
+; FORCE: [[RETURN]]:
+; FORCE-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 6d2cda4..0287645 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost1(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
; CHECK-NEXT: br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: ._crit_edge.loopexit:
; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
; CHECK: ._crit_edge:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
%1 = icmp sgt i32 %n, 3
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
-define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: ._crit_edge.loopexit:
; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
; CHECK: ._crit_edge:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
%1 = icmp sgt i32 %n, 9
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 9453ad7..725fa49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -540,6 +540,8 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -551,14 +553,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1)
; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1)
; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index af5c921..fa3b4a66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux"
;CHECK-LABEL: func1x6(
;CHECK: <4 x i32>
;CHECK: ret
-define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
entry:
br label %for.body
@@ -40,14 +40,14 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
- ret i32 undef
+ ret void
}
; We are vectorizing with 12 runtime checks.
;CHECK-LABEL: func2x6(
;CHECK: <4 x i32>
;CHECK: ret
-define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
entry:
br label %for.body
@@ -85,5 +85,5 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
index 8971dfe..47355e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK-NOUNRL: store <4 x i32>
;CHECK-NOUNRL-NOT: store <4 x i32>
;CHECK-NOUNRL: ret
-define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
+define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 28de5c7..56f0b85 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -58,7 +58,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
@@ -72,7 +72,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20
-; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
@@ -88,7 +88,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -132,14 +132,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
@@ -180,14 +180,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index 65c12a1..224ec4a6 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -34,8 +34,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -73,29 +74,28 @@ define void @test2(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 2
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
%ptrint = ptrtoint ptr %a to i64
@@ -163,7 +163,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index f64255f..b7aa958 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; When scalarizing stores we need to preserve the original order.
; Make sure that we are extracting in the correct order (0101, and not 0011).
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -55,7 +55,7 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 1588d02..51255b2 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp26 = icmp sgt i32 %n, 0
@@ -106,11 +106,11 @@ if.end14:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
; As above but with multiple variables set per block.
-define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK-LABEL: @multi_variable_if_nest(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp26 = icmp sgt i32 %n, 0
@@ -224,5 +224,5 @@ if.end14:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
index 8a7f4a3..a88a9b14 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; }
;}
-define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
-; CHECK-LABEL: define i32 @function0(
+define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+; CHECK-LABEL: define void @function0(
; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]]
@@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end)
; CHECK: [[FOR_END_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_END]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp16 = icmp slt i32 %start, %end
@@ -127,7 +127,7 @@ if.end:
br i1 %cmp, label %for.body, label %for.end
for.end:
- ret i32 undef
+ ret void
}
@@ -237,6 +237,8 @@ for.end: ; preds = %for.inc, %entry
; Handle PHI with single incoming value having a full mask.
; PR34523
+; NOTE: Changing the PHI inputs from undef to poison changes the behaviour of
+; the test. Left as undef for now.
define void @PR34523() {
; CHECK-LABEL: define void @PR34523() {
; CHECK-NEXT: [[BB1:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 742ee64..eea2237 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -337,7 +337,7 @@ for.end: ; preds = %for.body
; }
; }
-define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
; CHECK: for.end10.loopexit:
; CHECK-NEXT: br label [[FOR_END10]]
; CHECK: for.end10:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp20 = icmp eq i32 %itr, 0
@@ -469,12 +469,12 @@ for.inc8: ; preds = %for.body3, %for.con
br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
for.end10: ; preds = %for.inc8, %entry
- ret i32 undef
+ ret void
}
; second uniform store to the same address is conditional.
; we do not vectorize this.
-define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores_conditional(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu
; CHECK: for.end10.loopexit:
; CHECK-NEXT: br label [[FOR_END10]]
; CHECK: for.end10:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp20 = icmp eq i32 %itr, 0
@@ -567,7 +567,7 @@ for.inc8: ; preds = %for.body3, %for.con
br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
for.end10: ; preds = %for.inc8, %entry
- ret i32 undef
+ ret void
}
; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
index b891b43..d9d9eec 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -132,7 +132,7 @@ for.end:
; CHECK-LABEL: @f6
; CHECK-NOT: <2 x i32>
-define i32 @f6(ptr %a, i32 %tmp) {
+define void @f6(ptr %a, i32 %tmp) {
entry:
br label %for.body
@@ -149,7 +149,7 @@ for.body:
br i1 %exitcond, label %for.body, label %for.end
for.end:
- ret i32 undef
+ ret void
}
; Don't vectorize true loop carried dependencies that are not a multiple of the
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 1533906..53dad3a 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -74,8 +74,7 @@ exit:
ret void
}
-; FIXME: Currently this mis-compiled when interleaving; all stores store the
-; last lane of the last part, instead of the last lane per part.
+; Check that each unrolled store stores the last lane of the corresponding part.
; Test case for https://github.com/llvm/llvm-project/issues/162498.
define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) {
; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
@@ -121,13 +120,15 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; VF2IC2-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2
; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3
-; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP7]], 1
; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]]
-; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
+; VF2IC2-NEXT: store i32 [[TMP8]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
new file mode 100644
index 0000000..ce07364
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
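+; The pow call is loop invariant; check that it is widened and that the
+; "deopt" operand bundle is preserved on the widened call.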
+define void @call_loop_invariant_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_loop_invariant_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "deopt"(float 1.000000e+01) ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "deopt"(float 10.0) ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
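+; An unknown operand bundle on the call does not block vectorization; check
+; that the bundle is kept on the widened call.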
+define void @call_unknown_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_unknown_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "unknown"(ptr null) ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "unknown"(ptr null) ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
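+; Check that the "cold" operand bundle is kept on the widened call.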
+define void @call_cold_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_cold_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "cold"() ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "cold"() ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
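+; The assume has a loop-variant "align" operand bundle; check that it is
+; replicated once per lane in the vector body.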
+define void @assume_loop_variant_operand_bundle(ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define void @assume_loop_variant_operand_bundle(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP0]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP1]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP2]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP3]]) ]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+02
+ tail call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 %iv) ]
+ %add = fadd float %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv
+ store float %add, ptr %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv, 1599
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
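+; The assume with a "cold" operand bundle is loop invariant; check that it is
+; hoisted to the vector preheader and executed only once.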
+define void @assume_cold_operand_bundle(ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define void @assume_cold_operand_bundle(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+02
+ tail call void @llvm.assume(i1 true) [ "cold"() ]
+ %add = fadd float %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv
+ store float %add, ptr %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv, 1599
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
index d700d48..f5e480c 100644
--- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
+++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -10,7 +10,7 @@
; CHECK: store i64 %indvars.outer, ptr %O2, align 4
-define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
@@ -50,5 +50,5 @@ for.end.outer.loopexit: ; preds = %for.end.inner
br label %for.end.outer
for.end.outer: ; preds = %for.end.outer.loopexit, %entry
- ret i64 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll
index ad7f6e7..0a9c8c1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr28541.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll
@@ -28,7 +28,7 @@
; CHECK-NOT: vectorized loop
; CHECK-LABEL: fn1
-define i32 @fn1() {
+define void @fn1() {
entry:
%tmp2 = load i32, ptr @b, align 4
%dec3 = add nsw i32 %tmp2, -1
@@ -67,5 +67,5 @@ while.cond.while.end_crit_edge: ; preds = %while.cond
br label %while.end
while.end: ; preds = %while.cond.while.end_crit_edge, %entry
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index f87be5a..6ea227f 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; a[i] = b[i] * 3;
; }
-define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]]
@@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG14:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef, !dbg [[DBG14]]
+; CHECK-NEXT: ret void, !dbg [[DBG14]]
;
; FORCED_OPTSIZE-LABEL: @foo(
; FORCED_OPTSIZE-NEXT: entry:
@@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
; FORCED_OPTSIZE: for.end.loopexit:
; FORCED_OPTSIZE-NEXT: br label [[FOR_END]], !dbg [[DBG10:![0-9]+]]
; FORCED_OPTSIZE: for.end:
-; FORCED_OPTSIZE-NEXT: ret i32 undef, !dbg [[DBG10]]
+; FORCED_OPTSIZE-NEXT: ret void, !dbg [[DBG10]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0, !dbg !6
@@ -99,7 +99,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.end, label %for.body, !dbg !7
for.end: ; preds = %for.body, %entry
- ret i32 undef, !dbg !8
+ ret void, !dbg !8
}
; Make sure that we try to vectorize loops with a runtime check if the
@@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]]
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]]
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: store i32 0, ptr [[IN]], align 4
; CHECK-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index ad8cd42..667df3a 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -88,11 +88,11 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP7]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
@@ -101,8 +101,6 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD3]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll
index cc21b94..8df71e83 100644
--- a/llvm/test/Transforms/LoopVectorize/write-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/write-only.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK-LABEL: @read_mod_write_single_ptr(
;CHECK: load <4 x float>
-;CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+;CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
; Ensure that volatile stores are not vectorized.
; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store(
; CHECK-NOT: store <4 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
index f1ffcc7..239397b 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
@@ -17,7 +17,7 @@
define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_
-; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0
; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]]