diff options
Diffstat (limited to 'llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll')
-rw-r--r-- | llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll | 233 |
1 files changed, 202 insertions, 31 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index fa6a66b..9775cf9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -1,15 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name INST --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s -; CHECK-LABEL: reduction_i32 -; CHECK: phi i32 [ 0, %vector.ph ] -; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] -; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) { +; CHECK-LABEL: define i16 @reduction_i32( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP5]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <8 x i16> [ [[VEC_PHI]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi <8 x i1> [ [[TMP5]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <8 x i16> [ [[TMP8]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[DOTLCSSA3]], <8 x i16> [[DOTLCSSA]], <8 x i16> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[TMP10]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i16> [[RDX_SHUF4]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i16> [[BIN_RDX5]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i16> [[RDX_SHUF6]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[BIN_RDX7]], i32 0 +; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -59,16 +99,52 @@ for.cond.cleanup: ret i16 %res.0 } -; CHECK-LABEL: reduction_i32_with_scalar -; CHECK: vector.body: -; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; CHECK-LABEL: define i16 @reduction_i32_with_scalar( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -119,15 +195,46 @@ for.cond.cleanup: ; despite this we can still calculate a precise enough range so that the ; the overflow checks for get.active.active.lane.mask don't reject ; tail-predication. -; -; CHECK-LABEL: @reduction_not_guarded -; -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; CHECK: ret -; define i16 @reduction_not_guarded(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; CHECK-LABEL: define i16 @reduction_not_guarded( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; entry: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 @@ -166,12 +273,76 @@ middle.block: ; preds = %vector.body ret i16 %tmp9 } -; CHECK-LABEL: @Correlation -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask -; define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @Correlation( +; CHECK-SAME: ptr readonly captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]], i16 signext [[SIZE:%.*]], i16 signext [[N:%.*]], i16 signext [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END17:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV51:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_END:.*]] ], [ [[TMP0]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV46:%.*]] = phi ptr [ [[SCEVGEP47:%.*]], %[[FOR_END]] ], [ [[INPUT]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], %[[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -4 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i32 [[TMP8]], 1 +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label %[[VECTOR_PH:.*]], label %[[FOR_END]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP9]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi ptr [ [[SCEVGEP49:%.*]], %[[VECTOR_BODY]] ], [ [[LSR_IV46]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[INPUT]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP11]]) +; CHECK-NEXT: [[TMP13]] = sub i32 [[TMP11]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV48]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[TMP19]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, ptr [[LSR_IV48]], i32 4 +; CHECK-NEXT: [[TMP21]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1) +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP20]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP25:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP25]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[OUTPUT]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP47]] = getelementptr i16, ptr [[LSR_IV46]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV51]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label %[[FOR_END17]], label %[[FOR_BODY]] +; CHECK: [[FOR_END17]]: +; CHECK-NEXT: ret void +; entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 |