diff options
Diffstat (limited to 'llvm/test/Transforms')
58 files changed, 2133 insertions, 602 deletions
diff --git a/llvm/test/Transforms/GVN/PRE/pre-load.ll b/llvm/test/Transforms/GVN/PRE/pre-load.ll index 5a07f9f..afa1354 100644 --- a/llvm/test/Transforms/GVN/PRE/pre-load.ll +++ b/llvm/test/Transforms/GVN/PRE/pre-load.ll @@ -1503,3 +1503,51 @@ wrong: exit: ret void } + +; Allow the load to be made available on the edge (%entry, %if.end) as part of PRE, +; but ensure `%identical.l` is not hoisted to its predecessor due to the local +; dependency with the call. + +define i32 @test24(ptr noalias %p, ptr noalias %q, i1 %c) { +; MDEP-LABEL: @test24( +; MDEP-NEXT: entry: +; MDEP-NEXT: br i1 [[C:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] +; MDEP: entry.if.end_crit_edge: +; MDEP-NEXT: [[VV_PRE:%.*]] = load i32, ptr [[X:%.*]], align 4 +; MDEP-NEXT: br label [[IF_END:%.*]] +; MDEP: if.then: +; MDEP-NEXT: call void @opaque(ptr [[X]]) +; MDEP-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4 +; MDEP-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4 +; MDEP-NEXT: br label [[IF_END]] +; MDEP: if.end: +; MDEP-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] +; MDEP-NEXT: ret i32 [[VV]] +; +; MSSA-LABEL: @test24( +; MSSA-NEXT: entry: +; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; MSSA: if.then: +; MSSA-NEXT: call void @opaque(ptr [[X:%.*]]) +; MSSA-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4 +; MSSA-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4 +; MSSA-NEXT: br label [[IF_END]] +; MSSA: if.end: +; MSSA-NEXT: [[VV:%.*]] = load i32, ptr [[X]], align 4 +; MSSA-NEXT: ret i32 [[VV]] +; +entry: + br i1 %c, label %if.end, label %if.then + +if.then: + call void @opaque(ptr %p) + %identical.l = load i32, ptr %p, align 4 + store i32 %identical.l, ptr %q, align 4 + br label %if.end + +if.end: + %l = load i32, ptr %p, align 4 + ret i32 %l +} + +declare void @opaque(ptr) nounwind willreturn diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll index cb4e07e..9b9bc68 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll @@ -60,8 +60,7 @@ define void @f_sadd_overflow(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -150,8 +149,7 @@ define void @f_uadd_overflow(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -243,10 +241,7 @@ define void @f_ssub_overflow(ptr nocapture %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 -; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -339,10 +334,7 @@ define void @f_usub_overflow(ptr nocapture %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 -; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] diff --git a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll index 9371fe2..dbd572d 100644 --- a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll +++ b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -p indvars -S %s | FileCheck %s +; RUN: opt -p indvars -data-layout='n32:64' -S %s | FileCheck --check-prefix=N32 %s declare i1 @cond() @@ -28,6 +29,32 @@ define i64 @test_ptr_compare_guard(ptr %start, ptr %end) { ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ] ; CHECK-NEXT: ret i64 [[RES]] ; +; N32-LABEL: define i64 @test_ptr_compare_guard( +; N32-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; N32-NEXT: [[ENTRY:.*]]: +; N32-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; N32-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; N32-NEXT: [[C_0:%.*]] = icmp eq ptr [[START]], [[END]] +; N32-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; N32: [[LOOP_HEADER_PREHEADER]]: +; N32-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -1 +; N32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; N32-NEXT: br label %[[LOOP_HEADER:.*]] +; N32: [[LOOP_HEADER]]: +; N32-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[LOOP_HEADER_PREHEADER]] ] +; N32-NEXT: [[C_1:%.*]] = call i1 @cond() +; N32-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[EXIT_LOOPEXIT:.*]] +; N32: [[LOOP_LATCH]]: +; N32-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; N32-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; N32-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]] +; N32: [[EXIT_LOOPEXIT]]: +; N32-NEXT: [[RES_PH:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[TMP1]], %[[LOOP_LATCH]] ] +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT]]: +; N32-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ] +; N32-NEXT: ret i64 [[RES]] +; entry: %c.0 = icmp eq ptr %start, %end br i1 %c.0, label %exit, label %loop.header @@ -48,3 +75,149 @@ exit: %res = phi i64 [ 0, %entry ], [ %i64.iv, %loop.latch ], [ 0, %loop.header ] ret i64 %res } + +define void @test_sub_cmp(ptr align 8 %start, ptr %end) { +; CHECK-LABEL: define void @test_sub_cmp( +; CHECK-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]] +; CHECK-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[CMP_LATCH:%.*]] = icmp ult i64 [[IV_NEXT]], [[PTR_DIFF]] +; CHECK-NEXT: br i1 [[CMP_LATCH]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_EARLY]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; N32-LABEL: define void @test_sub_cmp( +; N32-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) { +; N32-NEXT: [[ENTRY:.*:]] +; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; N32-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]] +; N32-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; N32: [[LOOP_HEADER_PREHEADER]]: +; N32-NEXT: br label %[[LOOP_HEADER:.*]] +; N32: [[LOOP_HEADER]]: +; N32-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; N32-NEXT: [[C_1:%.*]] = call i1 @cond() +; N32-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]] +; N32: [[LOOP_LATCH]]: +; N32-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[PTR_DIFF]] +; N32-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]] +; N32: [[EXIT_EARLY]]: +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT_LOOPEXIT]]: +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT]]: +; N32-NEXT: ret void +; +entry: + %start.int = ptrtoint ptr %start to i64 + %end.int = ptrtoint ptr %end to i64 + %ptr.diff = sub i64 %start.int, %end.int + %cmp.entry = icmp eq ptr %start, %end + br i1 %cmp.entry, label %exit, label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %c.1 = call i1 @cond() + br i1 %c.1, label %exit.early, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %cmp.latch = icmp ult i64 %iv.next, %ptr.diff + br i1 %cmp.latch, label %loop.header, label %exit + +exit.early: + br label %exit + +exit: + ret void +} + + +define void @test_ptr_diff_with_assume(ptr align 8 %start, ptr align 8 %end, ptr %P) { +; CHECK-LABEL: define void @test_ptr_diff_with_assume( +; CHECK-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; CHECK-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2 +; CHECK-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]]) +; CHECK-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]] +; CHECK-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]] +; CHECK-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]] +; CHECK: [[LOOP_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_BODY:.*]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i1 @cond() +; CHECK-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1 +; CHECK-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]] +; CHECK-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; N32-LABEL: define void @test_ptr_diff_with_assume( +; N32-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) { +; N32-NEXT: [[ENTRY:.*:]] +; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; N32-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2 +; N32-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]]) +; N32-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]] +; N32-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]] +; N32-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]] +; N32: [[LOOP_BODY_PREHEADER]]: +; N32-NEXT: br label %[[LOOP_BODY:.*]] +; N32: [[LOOP_BODY]]: +; N32-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ] +; N32-NEXT: [[TMP0:%.*]] = call i1 @cond() +; N32-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1 +; N32-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]] +; N32-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]] +; N32: [[EXIT_LOOPEXIT]]: +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT]]: +; N32-NEXT: ret void +; +entry: + %start.int = ptrtoint ptr %start to i64 + %end.int = ptrtoint ptr %end to i64 + %ptr.diff = sub i64 %start.int, %end.int + %diff.cmp = icmp ult i64 %ptr.diff, 2 + call void @llvm.assume(i1 %diff.cmp) + %computed.end = getelementptr i8, ptr %start, i64 %ptr.diff + %entry.cmp = icmp eq ptr %start, %end + br i1 %entry.cmp, label %exit, label %loop.body + +loop.body: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop.body ] + call i1 @cond() + %iv.next = getelementptr i8, ptr %iv, i64 1 + %loop.cmp = icmp eq ptr %iv.next, %computed.end + br i1 %loop.cmp, label %exit, label %loop.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll new file mode 100644 index 0000000..b9c9228 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll @@ -0,0 +1,738 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=indvars < %s | FileCheck %s + +define void @optimize_trap(i32 %block_size) { +; CHECK-LABEL: define void @optimize_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_atomic(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_atomic( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store atomic i8 %1, ptr %arrayidx7 unordered, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_volatile(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_volatile( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store volatile i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_call(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_call( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: call void @x(ptr null) +; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + call void @x(ptr null) + store volatile i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @optimize_ubsan_trap(i32 %block_size) { +; CHECK-LABEL: define void @optimize_ubsan_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.ubsantrap(i8 1) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.ubsantrap(i8 1) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_arbitrary_call(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_arbitrary_call( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn_with_argmem(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn_with_argmem(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_two_exits(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_two_exits( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[P:%.*]] = call i1 @pred() +; CHECK-NEXT: br i1 [[P]], label %[[FOR_BODY_CONT:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: [[FOR_BODY_CONT]]: +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %p = call i1 @pred() + br i1 %p, label %for.body.cont, label %for.cond.cleanup.loopexit + +for.body.cont: ; preds = %for.body.preheader, %if.end4 + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_two_exits2(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_two_exits2( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_BODY_CONT:.*]] +; CHECK: [[FOR_BODY_CONT]]: +; CHECK-NEXT: [[P:%.*]] = call i1 @pred() +; CHECK-NEXT: br i1 [[P]], label %[[IF_END4]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %for.body.cont + +for.body.cont: ; preds = %for.body.preheader, %if.end4 + %p = call i1 @pred() + br i1 %p, label %if.end4, label %for.cond.cleanup.loopexit + +if.then: ; preds = %for.body + call void @noreturn(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_depdendent_ubsan_trap(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_depdendent_ubsan_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[I_015_LCSSA:%.*]] = phi i32 [ [[I_015]], %[[FOR_BODY]] ] +; CHECK-NEXT: call void @noreturn_with_i32(i32 [[I_015_LCSSA]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn_with_i32(i32 %i.015) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_depdendent_load_trap(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_depdendent_load_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[I_015_LCSSA:%.*]] = load i8, ptr [[FOO_ARR]], align 1 +; CHECK-NEXT: call void @noreturn_with_i8(i8 [[I_015_LCSSA]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + %r = load i8, ptr %foo_arr, align 1 + call void @noreturn_with_i8(i8 %r) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + + +declare void @x(ptr noundef) local_unnamed_addr +declare i1 @pred() local_unnamed_addr + +declare void @llvm.trap() #0 +declare void @noreturn(ptr) #0 +declare void @noreturn_with_i32(i32) #0 +declare void @noreturn_with_i8(i8) #0 +declare void @noreturn_with_argmem(ptr) #1 + +attributes #0 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #1 = { cold noreturn nounwind memory(argmem: read) } diff --git a/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll new file mode 100644 index 0000000..0887f5e --- /dev/null +++ b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; REQUIRES: llvm_inliner_model_autogenerated && asserts +; RUN: opt -passes='default<O3>' -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s +; RUN: opt -passes='default<O3>' -ml-inliner-stop-immediately -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s + +declare ptr @f() + +define void @e() #0 { +; CHECK-LABEL: define void @e( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: tail call void @d() +; CHECK-NEXT: tail call void @g() +; CHECK-NEXT: tail call void @d() +; CHECK-NEXT: tail call void @g() +; CHECK-NEXT: tail call void @d() +; CHECK-NEXT: tail call void @g() +; CHECK-NEXT: ret void +; + call void @h() + call void @h() + call void @h() + ret void +} + +define void @d() { +; CHECK-LABEL: define void @d() local_unnamed_addr { +; CHECK-NEXT: tail call void @f() +; CHECK-NEXT: ret void +; + call void @f() + ret void +} + +define void @g() { +; CHECK-LABEL: define void @g() local_unnamed_addr { +; CHECK-NEXT: tail call void @f() +; CHECK-NEXT: ret void +; + call void @f() + ret void +} + +define void @h() #0 { +; CHECK-LABEL: define void @h( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: tail call void @d() +; CHECK-NEXT: tail call void @g() +; CHECK-NEXT: ret void +; + call void @d() + call void @g() + ret void +} + +attributes #0 = { "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" } diff --git a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll index 7cc4446..ad45d1e 100644 --- a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll +++ b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll @@ -11,10 +11,8 @@ define i16 @test5(i16 %A) !dbg !34 { call void @llvm.dbg.value(metadata i32 %C, metadata !37, metadata !DIExpression()), !dbg !41 ; Preserve the dbg.value for the DCE'd 32-bit 'and'. - ; - ; The high 16 bits of the original 'and' require sign-extending the new 16-bit and: ; CHECK-NEXT: #dbg_value(i16 [[and]], [[C:![0-9]+]], - ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value) + ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_stack_value) %D = trunc i32 %C to i16, !dbg !42 call void @llvm.dbg.value(metadata i16 %D, metadata !38, metadata !DIExpression()), !dbg !42 diff --git a/llvm/test/Transforms/InstCombine/icmp-trunc.ll b/llvm/test/Transforms/InstCombine/icmp-trunc.ll index b85deab..ad76ef7 100644 --- a/llvm/test/Transforms/InstCombine/icmp-trunc.ll +++ b/llvm/test/Transforms/InstCombine/icmp-trunc.ll @@ -3,6 +3,7 @@ ; RUN: opt < %s -passes=instcombine -S -data-layout="n8" | FileCheck %s --check-prefixes=CHECK,DL8 declare void @use(i8) +declare void @use2(i4) define i1 @ult_2(i32 %x) { ; CHECK-LABEL: @ult_2( @@ -785,3 +786,32 @@ define <2 x i1> @uge_nsw_non_splat(<2 x i32> %x) { ret <2 x i1> %r } +define i1 @trunc_icmp(i8 %a0) { +; CHECK-LABEL: @trunc_icmp( +; CHECK-NEXT: [[TZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0:%.*]], i1 false) +; CHECK-NEXT: [[TR:%.*]] = trunc nuw i8 [[TZ]] to i4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A0]], 0 +; CHECK-NEXT: call void @use2(i4 [[TR]]) +; CHECK-NEXT: ret i1 [[C]] +; + %tz = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 %a0, i1 false) + %tr = trunc i8 %tz to i4 + %c = icmp eq i4 %tr, 8 + call void @use2(i4 %tr) + ret i1 %c +} + +define i1 @do_not_mask_trunc_eq_i32_i8(i32 %x) { +; DL64-LABEL: @do_not_mask_trunc_eq_i32_i8( +; DL64-NEXT: [[R:%.*]] = icmp eq i32 [[X:%.*]], 42 +; DL64-NEXT: ret i1 [[R]] +; +; DL8-LABEL: @do_not_mask_trunc_eq_i32_i8( +; DL8-NEXT: [[T:%.*]] = trunc nuw i32 [[X:%.*]] to i8 +; DL8-NEXT: [[R:%.*]] = icmp eq i8 [[T]], 42 +; DL8-NEXT: ret i1 [[R]] +; + %t = trunc nuw i32 %x to i8 + %r = icmp eq i8 %t, 42 + ret i1 %r +} diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll index 69b8f69..82ecbd4 100644 --- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll +++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll @@ -86,3 +86,14 @@ define <4 x ptr> @test7(<4 x i128> %arg) nounwind { %p1 = inttoptr <4 x i128> %arg to <4 x ptr> ret <4 x ptr> %p1 } + +define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) { +; CHECK-LABEL: @ptrtoint_gep_sub( +; CHECK-NEXT: ret i64 [[END_ADDR:%.*]] +; + %ptr.addr = ptrtoint ptr %ptr to i64 + %size = sub i64 %end.addr, %ptr.addr + %end = getelementptr i8, ptr %ptr, i64 %size + %end.addr2 = ptrtoint ptr %end to i64 + ret i64 %end.addr2 +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll index 7b30edb..71dad41 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=instsimplify -S < %s | FileCheck %s +; RUN: opt -passes=instsimplify -use-constant-int-for-fixed-length-splat -S < %s | FileCheck %s ; Test that intrinsics wasm call are constant folded diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll index 68b45a94..f68b85e 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s declare i31 @llvm.ctpop.i31(i31 %val) declare i32 @llvm.cttz.i32(i32 %val, i1) @@ -120,6 +121,22 @@ define <2 x i31> @ctpop_vector() { ret <2 x i31> %x } +define <2 x i31> @ctpop_vector_splat_v2i31() { +; CHECK-LABEL: @ctpop_vector_splat_v2i31( +; CHECK-NEXT: ret <2 x i31> splat (i31 1) +; + %x = call <2 x i31> @llvm.ctpop.v2i31(<2 x i31> splat(i31 16)) + ret <2 x i31> %x +} + +define <vscale x 2 x i31> @ctpop_vector_splat_nxv2i31() { +; CHECK-LABEL: @ctpop_vector_splat_nxv2i31( +; CHECK-NEXT: ret <vscale x 2 x i31> splat (i31 1) +; + %x = call <vscale x 2 x i31> @llvm.ctpop.nxv2i31(<vscale x 2 x i31> splat(i31 16)) + ret <vscale x 2 x i31> %x +} + define <2 x i31> @ctpop_vector_undef() { ; CHECK-LABEL: @ctpop_vector_undef( ; CHECK-NEXT: ret <2 x i31> zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll new file mode 100644 index 0000000..409141a --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s + +define i16 @W() { +; CHECK-LABEL: define i16 @W() { +; CHECK-NEXT: ret i16 -32768 +; + %Z = call i16 @llvm.bitreverse.i16(i16 1) + ret i16 %Z +} + +define i32 @X() { +; CHECK-LABEL: define i32 @X() { +; CHECK-NEXT: ret i32 -2147483648 +; + %Z = call i32 @llvm.bitreverse.i32(i32 1) + ret i32 %Z +} + +define i64 @Y() { +; CHECK-LABEL: define i64 @Y() { +; CHECK-NEXT: ret i64 -9223372036854775808 +; + %Z = call i64 @llvm.bitreverse.i64(i64 1) + ret i64 %Z +} + +define i80 @Z() { +; CHECK-LABEL: define i80 @Z() { +; CHECK-NEXT: ret i80 23777929115895377691656 +; + %Z = call i80 @llvm.bitreverse.i80(i80 76151636403560493650080) + ret i80 %Z +} + +define <4 x i32> @bitreverse_splat_v4i32() { +; CHECK-LABEL: define <4 x i32> @bitreverse_splat_v4i32() { +; CHECK-NEXT: ret <4 x i32> splat (i32 -2147483648) +; + %Z = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> splat(i32 1)) + ret <4 x i32> %Z +} + +define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() { +; CHECK-LABEL: define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() { +; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 -2147483648) +; + %Z = call <vscale x 4 x i32> @llvm.bitreverse.v4i32(<vscale x 4 x i32> splat(i32 1)) + ret <vscale x 4 x i32> %Z +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll index 42bb733..4db8ced 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll @@ -2,6 +2,7 @@ ; bswap should be constant folded when it is passed a constant argument ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s declare i16 @llvm.bswap.i16(i16) @@ -42,3 +43,19 @@ define i80 @Z() { %Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 ) ret i80 %Z } + +define <4 x i32> @bswap_splat_v4i32() { +; CHECK-LABEL: define <4 x i32> @bswap_splat_v4i32() { +; CHECK-NEXT: ret <4 x i32> splat (i32 16777216) +; + %Z = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> splat(i32 1)) + ret <4 x i32> %Z +} + +define <vscale x 4 x i32> @bswap_splat_nxv4i32() { +; CHECK-LABEL: define <vscale x 4 x i32> @bswap_splat_nxv4i32() { +; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 16777216) +; + %Z = call <vscale x 4 x i32> @llvm.bswap.v4i32(<vscale x 4 x i32> splat(i32 1)) + ret <vscale x 4 x i32> %Z +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index e994921..9f9e3f9 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll index ed9fba3..22ab79d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll @@ -289,6 +289,225 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l ret void } +define void @deinterleave1_nxi64_factor3(ptr %ptr, <vscale x 4 x i64>* %s1, <vscale x 4 x i64>* %s2, <vscale x 4 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } poison, <vscale x 4 x i64> [[TMP10]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP15]], <vscale x 4 x i64> [[TMP12]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP16]], <vscale x 4 x i64> [[TMP14]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 2 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP18]], ptr [[S1]], align 32 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP19]], ptr [[S2]], align 32 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP20]], ptr [[S3]], align 32 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 12 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 2 + + store <vscale x 4 x i64> %3, <vscale x 4 x i64>* %s1 + store <vscale x 4 x i64> %4, <vscale x 4 x i64>* %s2 + store <vscale x 4 x i64> %5, <vscale x 4 x i64>* %s3 + ret void +} + +define void @deinterleave2_nxi64_factor3(ptr %ptr, <vscale x 8 x i64>* %s1, <vscale x 8 x i64>* %s2, <vscale x 8 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave2_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6 +; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP16]], i64 4) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP18]], i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP20]], i64 4) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9 +; CHECK-NEXT: [[LDN4:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP17]], <vscale x 2 x i64> [[TMP23]], i64 6) +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP25]], i64 6) +; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP21]], <vscale x 2 x i64> [[TMP27]], i64 6) +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } poison, <vscale x 8 x i64> [[TMP24]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP29]], <vscale x 8 x i64> [[TMP26]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP30]], <vscale x 8 x i64> [[TMP28]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 2 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP32]], ptr [[S1]], align 64 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP33]], ptr [[S2]], align 64 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP34]], ptr [[S3]], align 64 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 24 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 2 + + store <vscale x 8 x i64> %3, <vscale x 8 x i64>* %s1 + store <vscale x 8 x i64> %4, <vscale x 8 x i64>* %s2 + store <vscale x 8 x i64> %5, <vscale x 8 x i64>* %s3 + ret void +} + +define void @deinterleave_neg1_nxi64_factor3(ptr %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave_neg1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8 +; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2 + + store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1 + store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2 + store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3 + ret void +} + +define void @deinterleave_neg2_nxi8_factor3(ptr %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave_neg2_nxi8_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8 +; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv12i8(<vscale x 24 x i8> %wide.vec) + + %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0 + %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1 + %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2 + + store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1 + store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2 + store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3 + ret void +} + +define void @interleave1_nxi64_factor3(ptr %ptr, <vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) #0 { +; CHECK-LABEL: define void @interleave1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i64> [[S1:%.*]], <vscale x 8 x i64> [[S2:%.*]], <vscale x 8 x i64> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]]) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 4) +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 4) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP9]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 6) +; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 6) +; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 6) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) + + store <vscale x 24 x i64> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave2_nxi64_factor3(ptr %ptr, <vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) #0 { +; CHECK-LABEL: define void @interleave2_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i64> [[S1:%.*]], <vscale x 4 x i64> [[S2:%.*]], <vscale x 4 x i64> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) + + store <vscale x 12 x i64> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_neg_nxi8_factor3(ptr %ptr, <vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) #0 { +; CHECK-LABEL: define void @interleave_neg_nxi8_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i8> [[S1:%.*]], <vscale x 8 x i8> [[S2:%.*]], <vscale x 8 x i8> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> [[S1]], <vscale x 8 x i8> [[S2]], <vscale x 8 x i8> [[S3]]) +; CHECK-NEXT: store <vscale x 24 x i8> [[INTERLEAVE]], ptr [[PTR]], align 4 +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) + + store <vscale x 24 x i8> %interleave, ptr %ptr, align 4 + ret void +} + declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>) declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>) declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>) @@ -312,4 +531,15 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, < ; Larger interleaves to test 'legalization' declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>) +; De-Interleaves with Factor=3 +declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>) +declare { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64>) +declare { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64>) +declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>) + +; Interleaves with Factor=3 +declare <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>) +declare <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64>) +declare <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64>) + attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll b/llvm/test/Transforms/LoopPredication/preserve-bpi.ll deleted file mode 100644 index 7fbb197..0000000 --- a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt -mtriple=x86_64 -passes='loop-mssa(loop-predication,licm,simple-loop-unswitch<nontrivial>,loop-simplifycfg)' -debug-pass-manager -debug-only=branch-prob -S < %s 2>&1 | FileCheck %s - -; REQUIRES: asserts - -; This test is to solely check that we do not run BPI every single time loop -; predication is invoked (since BPI is preserved as part of -; LoopStandardAnalysisResults). -declare void @llvm.experimental.guard(i1, ...) - -; CHECK: Running pass: LoopPredicationPass on loop -; CHECK-NEXT: Running pass: LICMPass on loop -; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on loop -; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy -; CHECK-NEXT: Running pass: LoopPredicationPass on loop -; CHECK-NEXT: Running pass: LICMPass on loop -; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on loop -; CHECK-NEXT: Running pass: LoopSimplifyCFGPass on loop - -define i32 @unsigned_loop_0_to_n_ult_check(ptr %array, i32 %length, i32 %n) { -entry: - %tmp5 = icmp eq i32 %n, 0 - br i1 %tmp5, label %exit, label %loop.preheader - -loop.preheader: ; preds = %entry - br label %loop - -loop: ; preds = %guarded, %loop.preheader - %loop.acc = phi i32 [ %loop.acc.next, %guarded ], [ 0, %loop.preheader ] - %i = phi i32 [ %i.next, %guarded ], [ 0, %loop.preheader ] - %within.bounds = icmp ult i32 %i, %length - %widenable_cond = call i1 @llvm.experimental.widenable.condition() - %exiplicit_guard_cond = and i1 %within.bounds, %widenable_cond - br i1 %exiplicit_guard_cond, label %guarded, label %deopt, !prof !0 - -deopt: ; preds = %loop - %deoptcall = call i32 (...) @llvm.experimental.deoptimize.i32(i32 9) [ "deopt"() ] - ret i32 %deoptcall - -guarded: ; preds = %loop - %i.i64 = zext i32 %i to i64 - %array.i.ptr = getelementptr inbounds i32, ptr %array, i64 %i.i64 - %array.i = load i32, ptr %array.i.ptr, align 4 - %loop.acc.next = add i32 %loop.acc, %array.i - %i.next = add nuw i32 %i, 1 - %continue = icmp ult i32 %i.next, %n - br i1 %continue, label %loop, label %exit, !prof !2 - -exit: ; preds = %guarded, %entry - %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %guarded ] - ret i32 %result -} - -declare i32 @llvm.experimental.deoptimize.i32(...) -declare i1 @llvm.experimental.widenable.condition() #0 - -attributes #0 = { inaccessiblememonly nounwind } - -!0 = !{!"branch_weights", i32 1048576, i32 1} -!1 = !{i32 1, i32 -2147483648} -!2 = !{!"branch_weights", i32 1024, i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll index 279d4e8..83623fd 100644 --- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll +++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @foo( ;CHECK: icmp eq <4 x i32> ;CHECK: select <4 x i1> -;CHECK: ret i32 -define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp { +;CHECK: ret void +define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp { entry: %cmp10 = icmp sgt i32 %x, 0 br i1 %cmp10, label %for.body, label %for.end @@ -35,5 +35,5 @@ if.end: ; preds = %for.body, %if.then br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %if.end, %entry - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 6cf11be..6fe6883 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -660,16 +660,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 ; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]] +; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] ; COMMON: [[PRED_STORE_IF13]]: ; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 ; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT1]] +; COMMON-NEXT: br label %[[EXIT]] +; COMMON: [[EXIT]]: +; COMMON-NEXT: br label %[[SCALAR_PH:.*]] +; COMMON: [[SCALAR_PH]]: +; COMMON-NEXT: br label %[[EXIT1:.*]] ; COMMON: [[EXIT1]]: -; COMMON-NEXT: br label %[[SCALAR_PH1:.*]] -; COMMON: [[SCALAR_PH1]]: -; COMMON-NEXT: br [[EXIT:label %.*]] -; COMMON: [[SCALAR_PH:.*:]] +; COMMON-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 93e71af..e3e4833 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -42,7 +42,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -80,7 +80,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] @@ -104,7 +104,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -167,13 +167,13 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -211,7 +211,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] @@ -235,7 +235,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -308,7 +308,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH:.*]] ; CHECK: [[SCALAR_PH]]: @@ -332,7 +332,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14 -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index e424649..75b18ff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -541,3 +541,22 @@ exit: ; preds = %for.body ; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-VS1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VS1: [[PROF3]] = !{!"branch_weights", i32 8, i32 8} +; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30} +;. +; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VS2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VS2: [[PROF3]] = !{!"branch_weights", i32 8, i32 8} +; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll index a6e0f8a..300f5d9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll @@ -40,6 +40,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -53,6 +54,15 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 @@ -262,6 +272,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -275,6 +286,15 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 @@ -412,6 +432,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -425,6 +446,15 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index ab9b48f..aff2c4c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -153,17 +153,20 @@ define void @uniform_gep_for_replicating_gep(ptr %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[VEC_IND]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1 ; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP18]], align 1 -; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP19]], align 1 +; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP19]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 8830ce3..5f79d02 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -38,8 +38,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -96,8 +97,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index d447517..f03f743 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -29,8 +29,9 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[COND:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true ; CHECK-NEXT: br i1 [[COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index b8f4e84..753847f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -33,8 +33,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true ; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: @@ -87,8 +88,9 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl ; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll index 596e42e..d0c1194 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll @@ -36,7 +36,7 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %exit, label %for.body } -define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 { +define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 { ; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ] ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032 @@ -70,7 +70,7 @@ for.cond.cleanup.loopexit: ; preds = %if.end br label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret i32 undef + ret void for.body: ; preds = %for.body.preheader, %if.end %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index e046816..e84c0d6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -67,7 +67,7 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -111,7 +111,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -155,7 +155,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -207,7 +207,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -221,7 +221,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 2fbc73e..c66d8d6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -133,7 +133,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -237,7 +237,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -346,7 +346,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -360,7 +360,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -468,7 +468,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -483,7 +483,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -640,7 +640,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -656,7 +656,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -790,14 +790,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]] ; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META6:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]] ; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]] -; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]] ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: @@ -813,7 +813,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; @@ -965,7 +965,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -981,7 +981,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -1145,16 +1145,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]] ; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP18]] ; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META12:![0-9]+]] ; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) -; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]] ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]] ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]] ; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: @@ -1170,7 +1170,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]] ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; @@ -1318,7 +1318,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: exit: @@ -1402,7 +1402,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: exit: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 0c22a9e..46daee4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -142,7 +142,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] @@ -267,7 +267,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: exit: @@ -382,7 +382,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] @@ -508,7 +508,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: exit: @@ -621,7 +621,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index bae97e5..c34417b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -129,7 +129,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -143,7 +143,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[FOR_END]]: ; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] @@ -204,7 +204,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] ; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -218,7 +218,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ] ; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] @@ -269,7 +269,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -350,7 +350,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -399,7 +399,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -457,7 +457,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -499,7 +499,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -557,7 +557,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -608,7 +608,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -679,7 +679,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -731,7 +731,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -812,7 +812,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -860,7 +860,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -918,7 +918,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll index b106f99..1153d18 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll @@ -6,7 +6,7 @@ ; Check that the addresses for a scalarized memory access is not extracted ; from a vector register. -define i32 @foo(ptr nocapture %A) { +define void @foo(ptr nocapture %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] @@ -27,7 +27,7 @@ define i32 @foo(ptr nocapture %A) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 poison +; CHECK-NEXT: ret void ; entry: @@ -44,12 +44,12 @@ for.body: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 poison + ret void } ; Check that a load of address is scalarized. -define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) { +define void @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) { ; CHECK-LABEL: @foo1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] @@ -74,7 +74,7 @@ define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 poison +; CHECK-NEXT: ret void ; entry: @@ -93,5 +93,5 @@ for.body: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 poison + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll index 9e20586..44fb8cb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll @@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0" ; CHECK-LABEL: @read_mod_write_single_ptr( ; CHECK: load <8 x float> -; CHECK: ret i32 -define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } ; CHECK-LABEL: @read_mod_i64( ; SLOWMEM32: load <2 x i64> ; FASTMEM32: load <4 x i64> -; CHECK: ret i32 -define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index e11b1ad..27d5e64 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" } ; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 ; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1 ; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1 -; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}} ; ; @a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 @@ -215,8 +214,9 @@ define void @PR40816() #1 { ; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 ; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FORCE: [[MIDDLE_BLOCK]]: -; FORCE-NEXT: br [[RETURN:label %.*]] -; FORCE: [[SCALAR_PH:.*:]] +; FORCE-NEXT: br label %[[RETURN:.*]] +; FORCE: [[RETURN]]: +; FORCE-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 6d2cda4..0287645 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { +define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { ; CHECK-LABEL: @conversion_cost1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3 ; CHECK-NEXT: br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]] @@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; %1 = icmp sgt i32 %n, 3 br i1 %1, label %.lr.ph, label %._crit_edge @@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } -define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { +define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { ; CHECK-LABEL: @conversion_cost2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9 ; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] @@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; %1 = icmp sgt i32 %n, 9 br i1 %1, label %.lr.ph, label %._crit_edge @@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 9453ad7..725fa49 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -540,6 +540,8 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -551,14 +553,6 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1) ; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1) ; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll index af5c921..fa3b4a66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux" ;CHECK-LABEL: func1x6( ;CHECK: <4 x i32> ;CHECK: ret -define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { +define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { entry: br label %for.body @@ -40,14 +40,14 @@ for.body: ; preds = %for.body, %entry br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body - ret i32 undef + ret void } ; We are vectorizing with 12 runtime checks. ;CHECK-LABEL: func2x6( ;CHECK: <4 x i32> ;CHECK: ret -define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { +define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { entry: br label %for.body @@ -85,5 +85,5 @@ for.body: ; preds = %for.body, %entry br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll index 8971dfe..47355e7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll @@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK-NOUNRL: store <4 x i32> ;CHECK-NOUNRL-NOT: store <4 x i32> ;CHECK-NOUNRL: ret -define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { +define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 28de5c7..56f0b85 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -58,7 +58,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -72,7 +72,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20 -; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -88,7 +88,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -132,14 +132,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: @@ -180,14 +180,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index 65c12a1..224ec4a6 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -34,8 +34,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -73,29 +74,28 @@ define void @test2(ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 2 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2 ; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: %ptrint = ptrtoint ptr %a to i64 @@ -163,7 +163,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index f64255f..b7aa958 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; When scalarizing stores we need to preserve the original order. ; Make sure that we are extracting in the correct order (0101, and not 0011). -define i32 @foo(ptr nocapture %A) { +define void @foo(ptr nocapture %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] @@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -55,7 +55,7 @@ for.body: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index 1588d02..51255b2 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { +define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp26 = icmp sgt i32 %n, 0 @@ -106,11 +106,11 @@ if.end14: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } ; As above but with multiple variables set per block. -define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { +define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-LABEL: @multi_variable_if_nest( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp26 = icmp sgt i32 %n, 0 @@ -224,5 +224,5 @@ if.end14: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll index 8a7f4a3..a88a9b14 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll @@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; } ;} -define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { -; CHECK-LABEL: define i32 @function0( +define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { +; CHECK-LABEL: define void @function0( ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]] @@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) ; CHECK: [[FOR_END_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp16 = icmp slt i32 %start, %end @@ -127,7 +127,7 @@ if.end: br i1 %cmp, label %for.body, label %for.end for.end: - ret i32 undef + ret void } @@ -237,6 +237,8 @@ for.end: ; preds = %for.inc, %entry ; Handle PHI with single incoming value having a full mask. ; PR34523 +; NOTE: Changing PHI inputs from undef to poison leads to change in +; behaviour of the test. Left as undef for now. define void @PR34523() { ; CHECK-LABEL: define void @PR34523() { ; CHECK-NEXT: [[BB1:.*:]] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index 742ee64..eea2237 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -337,7 +337,7 @@ for.end: ; preds = %for.body ; } ; } -define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { +define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] ; CHECK: for.end10: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp20 = icmp eq i32 %itr, 0 @@ -469,12 +469,12 @@ for.inc8: ; preds = %for.body3, %for.con br i1 %exitcond26, label %for.end10, label %for.cond1.preheader for.end10: ; preds = %for.inc8, %entry - ret i32 undef + ret void } ; second uniform store to the same address is conditional. ; we do not vectorize this. -define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { +define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores_conditional( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] ; CHECK: for.end10: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp20 = icmp eq i32 %itr, 0 @@ -567,7 +567,7 @@ for.inc8: ; preds = %for.body3, %for.con br i1 %exitcond26, label %for.end10, label %for.cond1.preheader for.end10: ; preds = %for.inc8, %entry - ret i32 undef + ret void } ; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll index b891b43..d9d9eec 100644 --- a/llvm/test/Transforms/LoopVectorize/memdep.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep.ll @@ -132,7 +132,7 @@ for.end: ; CHECK-LABEL: @f6 ; CHECK-NOT: <2 x i32> -define i32 @f6(ptr %a, i32 %tmp) { +define void @f6(ptr %a, i32 %tmp) { entry: br label %for.body @@ -149,7 +149,7 @@ for.body: br i1 %exitcond, label %for.body, label %for.end for.end: - ret i32 undef + ret void } ; Don't vectorize true loop carried dependencies that are not a multiple of the diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll index 1533906..53dad3a 100644 --- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll +++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll @@ -74,8 +74,7 @@ exit: ret void } -; FIXME: Currently this mis-compiled when interleaving; all stores store the -; last lane of the last part, instead of the last lane per part. +; Check each unrolled store stores the last lane of the corresponding part. ; Test case for https://github.com/llvm/llvm-project/issues/162498. define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) { ; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts( @@ -121,13 +120,15 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts( ; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF2IC2: [[VECTOR_BODY]]: ; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 ; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2 ; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3 -; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP7]], 1 ; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1 ; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]] ; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]] -; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4 +; VF2IC2-NEXT: store i32 [[TMP8]], ptr [[TMP4]], align 4 ; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4 ; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll new file mode 100644 index 0000000..ce07364 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +define void @call_loop_invariant_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_loop_invariant_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "deopt"(float 1.000000e+01) ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "deopt"(float 10.0) ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @call_unknown_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_unknown_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "unknown"(ptr null) ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "unknown"(ptr null) ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @call_cold_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_cold_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "cold"() ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "cold"() ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @assume_loop_variant_operand_bundle(ptr noalias %a, ptr noalias %b) { +; CHECK-LABEL: define void @assume_loop_variant_operand_bundle( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP0]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP1]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP2]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP3]]) ] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+02 + tail call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 %iv) ] + %add = fadd float %0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv + store float %add, ptr %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1599 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @assume_cold_operand_bundle(ptr noalias %a, ptr noalias %b) { +; CHECK-LABEL: define void @assume_cold_operand_bundle( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+02 + tail call void @llvm.assume(i1 true) [ "cold"() ] + %add = fadd float %0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv + store float %add, ptr %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1599 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll index d700d48..f5e480c 100644 --- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll +++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll @@ -10,7 +10,7 @@ ; CHECK: store i64 %indvars.outer, ptr %O2, align 4 -define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) { +define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) { entry: %cmp = icmp sgt i64 %n, 0 br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer @@ -50,5 +50,5 @@ for.end.outer.loopexit: ; preds = %for.end.inner br label %for.end.outer for.end.outer: ; preds = %for.end.outer.loopexit, %entry - ret i64 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll index ad7f6e7..0a9c8c1 100644 --- a/llvm/test/Transforms/LoopVectorize/pr28541.ll +++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll @@ -28,7 +28,7 @@ ; CHECK-NOT: vectorized loop ; CHECK-LABEL: fn1 -define i32 @fn1() { +define void @fn1() { entry: %tmp2 = load i32, ptr @b, align 4 %dec3 = add nsw i32 %tmp2, -1 @@ -67,5 +67,5 @@ while.cond.while.end_crit_edge: ; preds = %while.cond br label %while.end while.end: ; preds = %while.cond.while.end_crit_edge, %entry - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/pr48832.ll b/llvm/test/Transforms/LoopVectorize/pr48832.ll index b89be88..c6ebe85 100644 --- a/llvm/test/Transforms/LoopVectorize/pr48832.ll +++ b/llvm/test/Transforms/LoopVectorize/pr48832.ll @@ -23,7 +23,7 @@ for.body: ; preds = %for.cond br i1 true, label %cond.false, label %land.rhs land.rhs: ; preds = %for.body - br i1 poison, label %cond.end, label %cond.false + br i1 false, label %cond.end, label %cond.false cond.false: ; preds = %for.body, %land.rhs br label %cond.end diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index f87be5a..6ea227f 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; a[i] = b[i] * 3; ; } -define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp { +define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]] @@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef, !dbg [[DBG14]] +; CHECK-NEXT: ret void, !dbg [[DBG14]] ; ; FORCED_OPTSIZE-LABEL: @foo( ; FORCED_OPTSIZE-NEXT: entry: @@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; FORCED_OPTSIZE: for.end.loopexit: ; FORCED_OPTSIZE-NEXT: br label [[FOR_END]], !dbg [[DBG10:![0-9]+]] ; FORCED_OPTSIZE: for.end: -; FORCED_OPTSIZE-NEXT: ret i32 undef, !dbg [[DBG10]] +; FORCED_OPTSIZE-NEXT: ret void, !dbg [[DBG10]] ; entry: %cmp6 = icmp sgt i32 %n, 0, !dbg !6 @@ -99,7 +99,7 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.end, label %for.body, !dbg !7 for.end: ; preds = %for.body, %entry - ret i32 undef, !dbg !8 + ret void, !dbg !8 } ; Make sure that we try to vectorize loops with a runtime check if the @@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]] -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] -; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]] +; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: store i32 0, ptr [[IN]], align 4 ; CHECK-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index ad8cd42..667df3a 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -88,11 +88,11 @@ define void @test2(ptr %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP7]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 @@ -101,8 +101,6 @@ define void @test2(ptr %a, ptr noalias %b) { ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD3]], splat (float 1.000000e+00) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll index cc21b94..8df71e83 100644 --- a/llvm/test/Transforms/LoopVectorize/write-only.ll +++ b/llvm/test/Transforms/LoopVectorize/write-only.ll @@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @read_mod_write_single_ptr( ;CHECK: load <4 x float> -;CHECK: ret i32 -define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +;CHECK: ret void +define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } ; Ensure that volatile stores are not vectorized. ; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store( ; CHECK-NOT: store <4 x float> -; CHECK: ret i32 -define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll index d281905..abd1d96 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32 @@ -10,179 +9,6 @@ target triple = "aarch64-unknown-unknown" define void @multiply(ptr %A, ptr %B, ptr %C) { -; PTR128-LABEL: @multiply( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128 -; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128 -; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128 -; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]] -; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] -; PTR128: alias_cont: -; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128 -; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]] -; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] -; PTR128: copy: -; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8 -; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) -; PTR128-NEXT: br label [[NO_ALIAS]] -; PTR128: no_alias: -; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] -; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128 -; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128 -; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128 -; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]] -; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]] -; PTR128: alias_cont1: -; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128 -; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]] -; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]] -; PTR128: copy2: -; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8 -; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) -; PTR128-NEXT: br label [[NO_ALIAS3]] -; PTR128: no_alias3: -; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32 -; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32 -; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] -; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]]) -; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]] -; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) -; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64 -; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8 -; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96 -; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8 -; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16 -; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8 -; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48 -; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]]) -; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]]) -; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]]) -; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]]) -; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8 -; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32 -; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8 -; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16 -; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 -; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48 -; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8 -; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32 -; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]] -; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]]) -; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]] -; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]]) -; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80 -; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 -; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112 -; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8 -; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16 -; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8 -; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48 -; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]]) -; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]]) -; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]]) -; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]]) -; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16 -; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8 -; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48 -; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8 -; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32 -; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8 -; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64 -; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8 -; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96 -; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]] -; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]]) -; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]] -; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]]) -; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64 -; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 -; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96 -; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8 -; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80 -; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8 -; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112 -; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]]) -; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]]) -; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]]) -; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]]) -; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64 -; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8 -; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96 -; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8 -; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16 -; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8 -; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48 -; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8 -; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64 -; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8 -; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96 -; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]] -; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]]) -; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]] -; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]]) -; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80 -; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8 -; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112 -; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 -; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80 -; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8 -; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112 -; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8 -; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]]) -; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]]) -; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer -; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]]) -; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]]) -; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80 -; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8 -; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112 -; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8 -; PTR128-NEXT: ret void -; ; PTR64-LABEL: @multiply( ; PTR64-NEXT: entry: ; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64 diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll index 87def6b..3d05014 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128 ; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64 ; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32 @@ -7,128 +6,13 @@ ; the need to emit `libc` calls), we perform strided index calculations using ; the same pointer bit-width as the matrix pointers, as determined by the data ; layout. To verify this behaviour, this test runs several strided loads and -; stores through the lowering pass with (32|64|128)-bit pointers, and verifies -; the generated code extends / truncates strides accordingly. Similarly, +; stores through the lowering pass with (32|64)-bit pointers, and verifies the +; generated code extends / truncates strides accordingly. Similarly, ; `data-layout-multiply-fused.ll` adopts this approach to verify the same ; behaviour for index calculations emitted while lowering fused matrix ; multiplies. -define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) { -; PTR128-LABEL: @strided_load_3x3_i128( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]] -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]] -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]] -; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] -; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; -; PTR64-LABEL: @strided_load_3x3_i128( -; PTR64-NEXT: entry: -; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64 -; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]] -; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]] -; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]] -; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]] -; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]] -; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]] -; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 -; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR64-NEXT: ret <9 x double> [[TMP2]] -; -; PTR32-LABEL: @strided_load_3x3_i128( -; PTR32-NEXT: entry: -; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32 -; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]] -; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]] -; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]] -; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]] -; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]] -; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]] -; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 -; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR32-NEXT: ret <9 x double> [[TMP2]] -; -entry: - %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3) - ret <9 x double> %load -} - -define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) { -; PTR128-LABEL: @strided_load_3x3_const_stride_i128( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 -; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; -; PTR64-LABEL: @strided_load_3x3_const_stride_i128( -; PTR64-NEXT: entry: -; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 -; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16 -; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32 -; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR64-NEXT: ret <9 x double> [[TMP2]] -; -; PTR32-LABEL: @strided_load_3x3_const_stride_i128( -; PTR32-NEXT: entry: -; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 -; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16 -; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32 -; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR32-NEXT: ret <9 x double> [[TMP2]] -; -entry: - %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3) - ret <9 x double> %load -} - define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) { -; PTR128-LABEL: @strided_load_3x3_i64( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128 -; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] -; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; ; PTR64-LABEL: @strided_load_3x3_i64( ; PTR64-NEXT: entry: ; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] @@ -168,18 +52,6 @@ entry: } define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) { -; PTR128-LABEL: @strided_load_3x3_const_stride_i64( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 -; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; ; PTR64-LABEL: @strided_load_3x3_const_stride_i64( ; PTR64-NEXT: entry: ; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 @@ -210,23 +82,6 @@ entry: } define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) { -; PTR128-LABEL: @strided_load_3x3_i32( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128 -; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]] -; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] -; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; ; PTR64-LABEL: @strided_load_3x3_i32( ; PTR64-NEXT: entry: ; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64 @@ -266,18 +121,6 @@ entry: } define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) { -; PTR128-LABEL: @strided_load_3x3_const_stride_i32( -; PTR128-NEXT: entry: -; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 -; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 -; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 -; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 -; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 -; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> -; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> -; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> -; PTR128-NEXT: ret <9 x double> [[TMP2]] -; ; PTR64-LABEL: @strided_load_3x3_const_stride_i32( ; PTR64-NEXT: entry: ; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 @@ -307,6 +150,5 @@ entry: ret <9 x double> %load } -declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32) declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32) declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll new file mode 100644 index 0000000..4ec5898 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='lower-matrix-intrinsics' -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s +; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s +; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s + +; REQUIRES: aarch64-registered-target + +target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128" +target triple = "aarch64-apple-ios" + +define void @matmul(ptr %a, ptr %b, ptr %c) { +; SPLIT_REMAINDER-LABEL: define void @matmul( +; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4 +; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4 +; SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4 +; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]] +; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]] +; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]] +; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]] +; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison> +; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2> +; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]] +; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]] +; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]] +; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]] +; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] +; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison> +; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3> +; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4 +; SPLIT_REMAINDER-NEXT: ret void +; +; NO_SPLIT_REMAINDER-LABEL: define void @matmul( +; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]] +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]] +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5> +; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4 +; NO_SPLIT_REMAINDER-NEXT: ret void +; + %a_load = load <3 x float>, ptr %a, align 4 + %b_load = load <9 x float>, ptr %b, align 4 + %matmul = tail call <3 x float> @llvm.matrix.multiply.v3f32.v9f32.v3f32(<3 x float> %a_load, <9 x float> %b_load, i32 1, i32 3, i32 3) + store <3 x float> %matmul, ptr %c, align 4 + ret void +} diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll new file mode 100644 index 0000000..fbc2cbc --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s +; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s +; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s + +; REQUIRES: aarch64-registered-target + +target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128" +target triple = "aarch64-apple-ios" + +define void @matmul(ptr %a, ptr %b, ptr %c) { +; SPLIT_REMAINDER-LABEL: define void @matmul( +; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4 +; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4 +; SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4 +; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4 +; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]] +; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] +; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]] +; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1> +; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]] +; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison> +; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2> +; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]] +; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]] +; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]] +; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2> +; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0 +; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer +; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]] +; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] +; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison> +; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3> +; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4 +; SPLIT_REMAINDER-NEXT: ret void +; +; NO_SPLIT_REMAINDER-LABEL: define void @matmul( +; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4 +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]] +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]] +; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0 +; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer +; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]] +; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5> +; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4 +; NO_SPLIT_REMAINDER-NEXT: ret void +; + %a_load = load <9 x float>, ptr %a, align 4 + %b_load = load <3 x float>, ptr %b, align 4 + %matmul = tail call <3 x float> @llvm.matrix.multiply.v9f32.v3f32.v3f32(<9 x float> %a_load, <3 x float> %b_load, i32 3, i32 3, i32 1) + store <3 x float> %matmul, ptr %c, align 4 + ret void +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/PhaseOrdering/unswitch-cold-func.ll index f1ffcc7..a6ebdf0 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/PhaseOrdering/unswitch-cold-func.ll @@ -1,13 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s +; RUN: opt < %s -passes='pgo-force-function-attrs,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -pgo-kind=pgo-instr-use-pipeline -pgo-cold-func-opt=optsize -S | FileCheck %s +; RUN: opt < %s -passes='pgo-force-function-attrs,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -pgo-kind=pgo-instr-use-pipeline -pgo-cold-func-opt=minsize -S | FileCheck %s ;; Check that non-trivial loop unswitching is not applied to a cold loop in a ;; cold loop nest. ;; IR was generated from the following loop nest, profiled when called ;; with M=0 and N=0. -;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) { +;; void function(bool cond, int M, int N, int * A, int *B, int *C) { ;; for (unsigned j = 0; j < M; j++) ;; for (unsigned i=0; i < N; i++) { ;; A[i] = B[i] + C[i]; @@ -15,9 +16,9 @@ ;; } ;; } -define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { -; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_ -; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] { +define void @_Z11functionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { +; CHECK-LABEL: define void @_Z11functionbiiPiS_S_ +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0 ; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll new file mode 100644 index 0000000..2f97b41 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP7:%.*]], %[[BB16:.*]] ], [ zeroinitializer, %[[BB1]] ] +; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5:.*]] +; CHECK: [[BB5]]: +; CHECK-NEXT: [[PHI8:%.*]] = phi double [ 0.000000e+00, %[[BB16]] ], [ 0.000000e+00, %[[BB1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], %[[BB16]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB1]] ] +; CHECK-NEXT: switch i32 0, label %[[BB21:.*]] [ +; CHECK-NEXT: i32 4, label %[[BB21]] +; CHECK-NEXT: i32 1, label %[[BB21]] +; CHECK-NEXT: i32 0, label %[[BB9:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB9]]: +; CHECK-NEXT: [[PHI13:%.*]] = phi double [ 0.000000e+00, %[[BB21]] ], [ 0.000000e+00, %[[BB5]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP1]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP9:%.*]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ] +; CHECK-NEXT: switch i32 0, label %[[BB15:.*]] [ +; CHECK-NEXT: i32 1, label %[[BB14:.*]] +; CHECK-NEXT: i32 0, label %[[BB16]] +; CHECK-NEXT: ] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3> +; CHECK-NEXT: br label %[[BB16]] +; CHECK: [[BB15]]: +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7> +; CHECK-NEXT: br label %[[BB16]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[PHI20:%.*]] = phi double [ 0.000000e+00, %[[BB15]] ], [ 0.000000e+00, %[[BB14]] ], [ 0.000000e+00, %[[BB9]] ] +; CHECK-NEXT: [[TMP7]] = phi <4 x i32> [ [[TMP5]], %[[BB15]] ], [ [[TMP4]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ] +; CHECK-NEXT: [[TMP8]] = phi <4 x i32> [ [[TMP6]], %[[BB15]] ], [ [[TMP3]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ] +; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB1]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[TMP9]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3> +; CHECK-NEXT: br label %[[BB9]] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi17, %bb16 ] + %phi2 = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi18, %bb16 ] + %phi3 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ] + %phi4 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ] + br i1 false, label %bb1, label %bb5 + +bb5: + %phi6 = phi i32 [ %phi17, %bb16 ], [ 0, %bb1 ] + %phi7 = phi i32 [ %phi19, %bb16 ], [ 0, %bb1 ] + %phi8 = phi double [ 0.000000e+00, %bb16 ], [ 0.000000e+00, %bb1 ] + switch i32 0, label %bb21 [ + i32 4, label %bb21 + i32 1, label %bb21 + i32 0, label %bb9 + ] + +bb9: + %phi10 = phi i32 [ %phi6, %bb21 ], [ 0, %bb5 ] + %phi11 = phi i32 [ %phi7, %bb21 ], [ 0, %bb5 ] + %phi12 = phi i32 [ 0, %bb21 ], [ 0, %bb5 ] + %phi13 = phi double [ 0.000000e+00, %bb21 ], [ 0.000000e+00, %bb5 ] + switch i32 0, label %bb15 [ + i32 1, label %bb14 + i32 0, label %bb16 + ] + +bb14: + br label %bb16 + +bb15: + %add = add i32 0, %phi10 + br label %bb16 + +bb16: + %phi17 = phi i32 [ %add, %bb15 ], [ %phi10, %bb14 ], [ 0, %bb9 ] + %phi18 = phi i32 [ %phi11, %bb15 ], [ 0, %bb14 ], [ 0, %bb9 ] + %phi19 = phi i32 [ %phi12, %bb15 ], [ %phi12, %bb14 ], [ 0, %bb9 ] + %phi20 = phi double [ 0.000000e+00, %bb15 ], [ 0.000000e+00, %bb14 ], [ 0.000000e+00, %bb9 ] + br i1 false, label %bb5, label %bb1 + +bb21: + br label %bb9 +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll index 9ab713c..383407b 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll @@ -18,7 +18,6 @@ ; the analysis caches. ; ; CHECK: Running pass: SimpleLoopUnswitchPass on loop %loop_begin in function test6 -; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-NEXT: Clearing all analysis results for: loop_a_inner |