diff options
Diffstat (limited to 'llvm/test/CodeGen/Thumb2')
4 files changed, 760 insertions, 138 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll index c2b4494..11e7e5c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -1,16 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s -; CHECK-LABEL: mul_v16i8 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16 -; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) -; CHECK: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> {{.*}}, ptr {{.*}}, i32 4, <16 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v16i8( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -16 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 16 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr [[TMP6]], i32 4, <16 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 15 @@ -45,17 +70,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: mul_v8i16 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8 -; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> {{.*}}, ptr {{.*}}, i32 4, <8 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v8i16( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -90,16 +139,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: mul_v4i32 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v4i32( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -134,17 +208,47 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: split_vector -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @split_vector( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXTRACT_1_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[EXTRACT_1_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <2 x i32> [[EXTRACT_1_LOW]], [[EXTRACT_2_LOW]] +; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> [[EXTRACT_1_HIGH]], [[EXTRACT_2_HIGH]] +; CHECK-NEXT: [[COMBINE:%.*]] = shufflevector <2 x i32> [[MUL]], <2 x i32> [[SUB]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -186,14 +290,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry } ; One of the loads now uses ult predicate. -; CHECK-LABEL: mismatch_load_pred -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_load_pred( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[WRONG]], <4 x i32> undef) +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -236,17 +374,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry } ; The store now uses ult predicate. -; CHECK-LABEL: mismatch_store_pred -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %wrong) define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_store_pred( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[WRONG]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -294,14 +463,72 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; ; Step value 16 doesn't match vector width 4 ; -; CHECK-LABEL: interleave4 -; CHECK: vector.body: -; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N) -; define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @interleave4( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[V0:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[V1:%.*]] = lshr i32 [[V0]], 4 +; CHECK-NEXT: [[V2:%.*]] = shl nuw i32 [[V1]], 4 +; CHECK-NEXT: [[V3:%.*]] = add i32 [[V2]], -16 +; CHECK-NEXT: [[V4:%.*]] = lshr i32 [[V3]], 4 +; CHECK-NEXT: [[V5:%.*]] = add nuw nsw i32 [[V4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, ptr [[A]], i32 8 +; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, ptr [[C]], i32 8 +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i32, ptr [[B]], i32 8 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[V5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP37]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV31:%.*]] = phi ptr [ [[SCEVGEP32:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP30]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP25:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[V14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[V6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[V15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[V7:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK15:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V7]], i32 [[N]]) +; CHECK-NEXT: [[V8:%.*]] = add i32 [[V7]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK16:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V8]], i32 [[N]]) +; CHECK-NEXT: [[V9:%.*]] = add i32 [[V8]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK17:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V9]], i32 [[N]]) +; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -2 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP42]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP41:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP34:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -2 +; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP34]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP35]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV31]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP36]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[V10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[V11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_MASKED_LOAD18]] +; CHECK-NEXT: [[V12:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD19]] +; CHECK-NEXT: [[V13:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD24]], [[WIDE_MASKED_LOAD20]] +; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -2 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr [[SCEVGEP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -1 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr [[SCEVGEP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]]) +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 1 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr [[SCEVGEP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]]) +; CHECK-NEXT: [[SCEVGEP25]] = getelementptr i32, ptr [[LSR_IV]], i32 16 +; CHECK-NEXT: [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 16 +; CHECK-NEXT: [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 16 +; CHECK-NEXT: [[V14]] = add i32 [[V9]], 4 +; CHECK-NEXT: [[V15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[V6]], i32 1) +; CHECK-NEXT: [[V16:%.*]] = icmp ne i32 [[V15]], 0 +; CHECK-NEXT: br i1 [[V16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %v0 = add i32 %N, 15 @@ -370,12 +597,42 @@ for.cond.cleanup: ret void } -; CHECK-LABEL: const_expected_in_set_loop -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @const_expected_in_set_loop( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 42) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -413,12 +670,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: tripcount_arg_not_invariant -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @tripcount_arg_not_invariant( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[INDEX]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -458,12 +745,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: addrec_base_not_zero -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @addrec_base_not_zero( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 1, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index fa6a66b..9775cf9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -1,15 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name INST --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s -; CHECK-LABEL: reduction_i32 -; CHECK: phi i32 [ 0, %vector.ph ] -; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] -; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) { +; CHECK-LABEL: define i16 @reduction_i32( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP5]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <8 x i16> [ [[VEC_PHI]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi <8 x i1> [ [[TMP5]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <8 x i16> [ [[TMP8]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[DOTLCSSA3]], <8 x i16> [[DOTLCSSA]], <8 x i16> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[TMP10]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i16> [[RDX_SHUF4]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i16> [[BIN_RDX5]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i16> [[RDX_SHUF6]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[BIN_RDX7]], i32 0 +; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -59,16 +99,52 @@ for.cond.cleanup: ret i16 %res.0 } -; CHECK-LABEL: reduction_i32_with_scalar -; CHECK: vector.body: -; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; CHECK-LABEL: define i16 @reduction_i32_with_scalar( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -119,15 +195,46 @@ for.cond.cleanup: ; despite this we can still calculate a precise enough range so that the ; the overflow checks for get.active.active.lane.mask don't reject ; tail-predication. -; -; CHECK-LABEL: @reduction_not_guarded -; -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; CHECK: ret -; define i16 @reduction_not_guarded(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; CHECK-LABEL: define i16 @reduction_not_guarded( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; entry: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 @@ -166,12 +273,76 @@ middle.block: ; preds = %vector.body ret i16 %tmp9 } -; CHECK-LABEL: @Correlation -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask -; define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @Correlation( +; CHECK-SAME: ptr readonly captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]], i16 signext [[SIZE:%.*]], i16 signext [[N:%.*]], i16 signext [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END17:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV51:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_END:.*]] ], [ [[TMP0]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV46:%.*]] = phi ptr [ [[SCEVGEP47:%.*]], %[[FOR_END]] ], [ [[INPUT]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], %[[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -4 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i32 [[TMP8]], 1 +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label %[[VECTOR_PH:.*]], label %[[FOR_END]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP9]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi ptr [ [[SCEVGEP49:%.*]], %[[VECTOR_BODY]] ], [ [[LSR_IV46]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[INPUT]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP11]]) +; CHECK-NEXT: [[TMP13]] = sub i32 [[TMP11]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV48]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[TMP19]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, ptr [[LSR_IV48]], i32 4 +; CHECK-NEXT: [[TMP21]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1) +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP20]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP25:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP25]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[OUTPUT]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP47]] = getelementptr i16, ptr [[LSR_IV46]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV51]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label %[[FOR_END17]], label %[[FOR_BODY]] +; CHECK: [[FOR_END17]]: +; CHECK-NEXT: ret void +; entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index a8ad360..b54d526 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -1,8 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s -; CHECK-LABEL: expand_v8i16_v8i32 -; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v8i16_v8i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define void @expand_v8i16_v8i32( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD2]] to <8 x i32> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i32> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -39,15 +74,57 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: expand_v8i16_v4i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) -; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8 -; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: %store.pred = icmp ule <4 x i32> %induction.store -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred) define void @expand_v8i16_v4i32(ptr readonly %a, ptr readonly %b, ptr %c, ptr %d, i32 %N) { +; CHECK-LABEL: define void @expand_v8i16_v4i32( +; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10_STORE]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STORE_IDX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[STORE_IDX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i16> [[EXTRACT_2_LOW]] to <4 x i32> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i16> [[EXTRACT_2_HIGH]] to <4 x i32> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[SUB:%.*]] = mul nsw <4 x i32> [[EXPAND_1]], [[EXPAND_2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[STORE_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT_STORE]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION_STORE:%.*]] = add <4 x i32> [[BROADCAST_SPLAT_STORE]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[STORE_PRED:%.*]] = icmp ule <4 x i32> [[INDUCTION_STORE]], [[BROADCAST_SPLAT11_STORE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[STORE_IDX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[D]], i32 [[STORE_IDX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[SUB]], ptr [[GEP]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: [[STORE_IDX_NEXT]] = add i32 [[STORE_IDX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -98,9 +175,43 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: expand_v4i32_v4i64 -; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v4i32_v4i64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define void @expand_v4i32_v4i64( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD]] to <4 x i64> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD2]] to <4 x i64> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i64> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index ec542df..fb1a4a4 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -1,24 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s -; CHECK-LABEL: vec_mul_reduce_add - -; CHECK: vector.ph: -; CHECK: %start = call i32 @llvm.start.loop.iterations.i32 -; CHECK: br label %vector.body - -; CHECK: vector.body: -; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) -; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]] -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], - -; CHECK: middle.block: -; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]], -; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) - define i32 @vec_mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { +; CHECK-LABEL: define i32 @vec_mul_reduce_add( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP7]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV2]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP11]] = add nsw <4 x i32> [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i32, ptr [[LSR_IV2]], i32 4 +; CHECK-NEXT: [[TMP12]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; entry: %cmp8 = icmp eq i32 %N, 0 %0 = add i32 %N, 3 |