; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=riscv64 -mattr=+v,+f -S %s | FileCheck %s --check-prefixes=COMMON,CHECK
; RUN: opt -p loop-unroll -mtriple=riscv64 -mcpu=sifive-p870 -S %s | FileCheck %s --check-prefixes=COMMON,SIFIVE

define void @reverse(ptr %dst, ptr %src, i64 %len) {
; CHECK-LABEL: define void @reverse(
; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @reverse(
; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[TMP2:%.*]] = add i64 [[LEN]], -1
; SIFIVE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; SIFIVE-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 7
; SIFIVE-NEXT: br i1 [[TMP3]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
; SIFIVE: [[ENTRY_NEW]]:
; SIFIVE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; SIFIVE-NEXT: br label %[[FOR_BODY:.*]]
; SIFIVE: [[FOR_BODY]]:
; SIFIVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; SIFIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; SIFIVE-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; SIFIVE-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; SIFIVE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; SIFIVE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; SIFIVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; SIFIVE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; SIFIVE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; SIFIVE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; SIFIVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; SIFIVE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; SIFIVE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; SIFIVE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; SIFIVE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; SIFIVE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; SIFIVE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; SIFIVE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; SIFIVE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; SIFIVE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; SIFIVE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; SIFIVE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; SIFIVE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; SIFIVE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; SIFIVE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; SIFIVE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; SIFIVE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; SIFIVE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; SIFIVE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; SIFIVE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; SIFIVE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; SIFIVE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; SIFIVE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; SIFIVE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; SIFIVE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; SIFIVE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; SIFIVE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; SIFIVE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; SIFIVE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; SIFIVE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; SIFIVE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; SIFIVE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]]
; SIFIVE: [[EXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: [[IV_UNR1:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER]], label %[[EXIT:.*]]
; SIFIVE: [[FOR_BODY_EPIL_PREHEADER]]:
; SIFIVE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR1]], %[[EXIT_UNR_LCSSA]] ]
; SIFIVE-NEXT: [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: call void @llvm.assume(i1 [[LCMP_MOD1]])
; SIFIVE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; SIFIVE: [[FOR_BODY_EPIL]]:
; SIFIVE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; SIFIVE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
; SIFIVE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
; SIFIVE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
; SIFIVE: [[FOR_BODY_EPIL_1]]:
; SIFIVE-NEXT: [[TMP20:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP20]]
; SIFIVE-NEXT: [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
; SIFIVE-NEXT: store <4 x float> [[TMP21]], ptr [[ARRAYIDX2_EPIL_1]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
; SIFIVE-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_2]]:
; SIFIVE-NEXT: [[TMP22:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP22]]
; SIFIVE-NEXT: [[TMP23:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
; SIFIVE-NEXT: store <4 x float> [[TMP23]], ptr [[ARRAYIDX2_EPIL_2]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_2:%.*]] = add nuw nsw i64 [[IV_UNR]], 3
; SIFIVE-NEXT: [[EPIL_ITER_CMP_2:%.*]] = icmp ne i64 3, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_2]], label %[[FOR_BODY_EPIL_3:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_3]]:
; SIFIVE-NEXT: [[TMP24:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_2]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP24]]
; SIFIVE-NEXT: [[TMP25:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_3]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_2]]
; SIFIVE-NEXT: store <4 x float> [[TMP25]], ptr [[ARRAYIDX2_EPIL_3]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_3:%.*]] = add nuw nsw i64 [[IV_UNR]], 4
; SIFIVE-NEXT: [[EPIL_ITER_CMP_3:%.*]] = icmp ne i64 4, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_3]], label %[[FOR_BODY_EPIL_4:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_4]]:
; SIFIVE-NEXT: [[TMP26:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_3]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP26]]
; SIFIVE-NEXT: [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_4]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_3]]
; SIFIVE-NEXT: store <4 x float> [[TMP27]], ptr [[ARRAYIDX2_EPIL_4]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_4:%.*]] = add nuw nsw i64 [[IV_UNR]], 5
; SIFIVE-NEXT: [[EPIL_ITER_CMP_4:%.*]] = icmp ne i64 5, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_4]], label %[[FOR_BODY_EPIL_5:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_5]]:
; SIFIVE-NEXT: [[TMP28:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_4]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP28]]
; SIFIVE-NEXT: [[TMP29:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_5]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_4]]
; SIFIVE-NEXT: store <4 x float> [[TMP29]], ptr [[ARRAYIDX2_EPIL_5]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_5:%.*]] = add nuw nsw i64 [[IV_UNR]], 6
; SIFIVE-NEXT: [[EPIL_ITER_CMP_5:%.*]] = icmp ne i64 6, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_5]], label %[[FOR_BODY_EPIL_6:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_6]]:
; SIFIVE-NEXT: [[TMP30:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_5]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP30]]
; SIFIVE-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_6]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_5]]
; SIFIVE-NEXT: store <4 x float> [[TMP31]], ptr [[ARRAYIDX2_EPIL_6]], align 16
; SIFIVE-NEXT: br label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[EXIT_EPILOG_LCSSA]]:
; SIFIVE-NEXT: br label %[[EXIT]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %0 = sub nsw i64 %len, %iv
  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %0
  %1 = load <4 x float>, ptr %arrayidx, align 16
  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
  store <4 x float> %1, ptr %arrayidx2, align 16
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %len
  br i1 %exitcond.not, label %exit, label %for.body

exit: ; preds = %for.body
  ret void
}

define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; COMMON-LABEL: define void @saxpy_tripcount8_full_unroll(
; COMMON-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; COMMON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; COMMON-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; COMMON-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; COMMON-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; COMMON-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; COMMON-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; COMMON-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; COMMON-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; COMMON-NEXT: ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 8
  br i1 %3, label %exit, label %vector.body

exit: ; preds = %vector.body
  ret void
}

define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
; CHECK-LABEL: define void @saxpy_tripcount1K_av0(
; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @saxpy_tripcount1K_av0(
; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; SIFIVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; SIFIVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SIFIVE: [[VECTOR_BODY]]:
; SIFIVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; SIFIVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; SIFIVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; SIFIVE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; SIFIVE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; SIFIVE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT1:%.*]] = add nuw nsw i64 [[INDEX]], 4
; SIFIVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT1]]
; SIFIVE-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; SIFIVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT1]]
; SIFIVE-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; SIFIVE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
; SIFIVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
; SIFIVE-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; SIFIVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
; SIFIVE-NEXT: [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; SIFIVE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
; SIFIVE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
; SIFIVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
; SIFIVE-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; SIFIVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
; SIFIVE-NEXT: [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; SIFIVE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
; SIFIVE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 16
; SIFIVE-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
; SIFIVE-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP49]], align 4
; SIFIVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
; SIFIVE-NEXT: [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
; SIFIVE-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
; SIFIVE-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
; SIFIVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
; SIFIVE-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
; SIFIVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
; SIFIVE-NEXT: [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
; SIFIVE-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
; SIFIVE-NEXT: store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
; SIFIVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
; SIFIVE-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; SIFIVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
; SIFIVE-NEXT: [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
; SIFIVE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
; SIFIVE-NEXT: store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
; SIFIVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
; SIFIVE-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
; SIFIVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
; SIFIVE-NEXT: [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; SIFIVE-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
; SIFIVE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
; SIFIVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
; SIFIVE-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; SIFIVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
; SIFIVE-NEXT: [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
; SIFIVE-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
; SIFIVE-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
; SIFIVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
; SIFIVE-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
; SIFIVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
; SIFIVE-NEXT: [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; SIFIVE-NEXT: [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
; SIFIVE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
; SIFIVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
; SIFIVE-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; SIFIVE-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
; SIFIVE-NEXT: [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
; SIFIVE-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
; SIFIVE-NEXT: store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
; SIFIVE-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
; SIFIVE-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; SIFIVE-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
; SIFIVE-NEXT: [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; SIFIVE-NEXT: [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
; SIFIVE-NEXT: store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
; SIFIVE-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
; SIFIVE-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; SIFIVE-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
; SIFIVE-NEXT: [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
; SIFIVE-NEXT: [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
; SIFIVE-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
; SIFIVE-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
; SIFIVE-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
; SIFIVE-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
; SIFIVE-NEXT: [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; SIFIVE-NEXT: [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
; SIFIVE-NEXT: store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
; SIFIVE-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
; SIFIVE-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; SIFIVE-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
; SIFIVE-NEXT: [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
; SIFIVE-NEXT: [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
; SIFIVE-NEXT: store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
; SIFIVE-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
; SIFIVE-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
; SIFIVE-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
; SIFIVE-NEXT: [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; SIFIVE-NEXT: [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
; SIFIVE-NEXT: store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
; SIFIVE-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
; SIFIVE-NEXT: br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 1024
  br i1 %3, label %exit, label %vector.body

exit: ; preds = %vector.body
  ret void
}

define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; COMMON-LABEL: define void @saxpy_tripcount1K_av1(
; COMMON-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; COMMON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; COMMON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; COMMON-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; COMMON-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; COMMON-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; COMMON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; COMMON-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 1024
  br i1 %3, label %exit, label %vector.body, !llvm.loop !0

exit: ; preds = %vector.body
  ret void
}

; On SiFive we should runtime unroll the scalar epilogue loop, but not the
; vector loop.
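; As the SIFIVE checks below show, the remainder loop is unrolled by 8: a
; prologue peels the first n % 8 iterations (xtraiter), and the remaining
; iterations then run through the loop body unrolled by a factor of 8.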
define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
; CHECK-LABEL: define void @scalar_epilogue(
; CHECK-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_P_IV:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV]]
; CHECK-NEXT: [[GEP_P_IV_16:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_P_IV]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP_P_IV]], align 1
; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i8>, ptr [[GEP_P_IV_16]], align 1
; CHECK-NEXT: [[ADD_BROADCAST:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[ADD_BROADCAST_2:%.*]] = add <16 x i8> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <16 x i8> [[ADD_BROADCAST]], ptr [[GEP_P_IV]], align 1
; CHECK-NEXT: store <16 x i8> [[ADD_BROADCAST_2]], ptr [[GEP_P_IV_16]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 32
; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; CHECK: [[SCALAR_REMAINDER_PREHEADER]]:
; CHECK-NEXT: [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; CHECK: [[SCALAR_REMAINDER]]:
; CHECK-NEXT: [[IV_SCALAR_LOOP:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP]]
; CHECK-NEXT: [[SCALAR_LOAD:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SCALAR_LOAD]], [[SPLAT_SCALAR]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nuw i64 [[IV_SCALAR_LOOP]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT_LOOPEXIT]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @scalar_epilogue(
; SIFIVE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; SIFIVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; SIFIVE: [[VECTOR_PH]]:
; SIFIVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; SIFIVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; SIFIVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; SIFIVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SIFIVE: [[VECTOR_BODY]]:
; SIFIVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SIFIVE-NEXT: [[GEP_P_IV:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV]]
; SIFIVE-NEXT: [[GEP_P_IV_16:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_P_IV]], i64 16
; SIFIVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP_P_IV]], align 1
; SIFIVE-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i8>, ptr [[GEP_P_IV_16]], align 1
; SIFIVE-NEXT: [[ADD_BROADCAST:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; SIFIVE-NEXT: [[ADD_BROADCAST_2:%.*]] = add <16 x i8> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
; SIFIVE-NEXT: store <16 x i8> [[ADD_BROADCAST]], ptr [[GEP_P_IV]], align 1
; SIFIVE-NEXT: store <16 x i8> [[ADD_BROADCAST_2]], ptr [[GEP_P_IV_16]], align 1
; SIFIVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 32
; SIFIVE-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; SIFIVE-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; SIFIVE: [[MIDDLE_BLOCK]]:
; SIFIVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SIFIVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; SIFIVE: [[SCALAR_REMAINDER_PREHEADER]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[TMP1:%.*]] = add i64 [[N]], -1
; SIFIVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 7
; SIFIVE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_PREHEADER]]:
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL]]:
; SIFIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[SCALAR_LOAD:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; SIFIVE-NEXT: [[ADD:%.*]] = add i8 [[SCALAR_LOAD]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; SIFIVE-NEXT: [[INC:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 1
; SIFIVE-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_1]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_1:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
; SIFIVE-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[SCALAR_LOAD_PROL_1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
; SIFIVE-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 2
; SIFIVE-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_2]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_2:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
; SIFIVE-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[SCALAR_LOAD_PROL_2]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
; SIFIVE-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 3
; SIFIVE-NEXT: [[PROL_ITER_CMP_2:%.*]] = icmp ne i64 3, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_2]], label %[[SCALAR_REMAINDER_PROL_3:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_3]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_2]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_3:%.*]] = load i8, ptr [[ARRAYIDX_PROL_3]], align 1
; SIFIVE-NEXT: [[ADD_PROL_3:%.*]] = add i8 [[SCALAR_LOAD_PROL_3]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_3]], ptr [[ARRAYIDX_PROL_3]], align 1
; SIFIVE-NEXT: [[INC_PROL_3:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 4
; SIFIVE-NEXT: [[PROL_ITER_CMP_3:%.*]] = icmp ne i64 4, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_3]], label %[[SCALAR_REMAINDER_PROL_4:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_4]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_3]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_4:%.*]] = load i8, ptr [[ARRAYIDX_PROL_4]], align 1
; SIFIVE-NEXT: [[ADD_PROL_4:%.*]] = add i8 [[SCALAR_LOAD_PROL_4]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_4]], ptr [[ARRAYIDX_PROL_4]], align 1
; SIFIVE-NEXT: [[INC_PROL_4:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 5
; SIFIVE-NEXT: [[PROL_ITER_CMP_4:%.*]] = icmp ne i64 5, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_4]], label %[[SCALAR_REMAINDER_PROL_5:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_5]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_4]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_5:%.*]] = load i8, ptr [[ARRAYIDX_PROL_5]], align 1
; SIFIVE-NEXT: [[ADD_PROL_5:%.*]] = add i8 [[SCALAR_LOAD_PROL_5]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_5]], ptr [[ARRAYIDX_PROL_5]], align 1
; SIFIVE-NEXT: [[INC_PROL_5:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 6
; SIFIVE-NEXT: [[PROL_ITER_CMP_5:%.*]] = icmp ne i64 6, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_5]], label %[[SCALAR_REMAINDER_PROL_6:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_6]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_5]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_6:%.*]] = load i8, ptr [[ARRAYIDX_PROL_6]], align 1
; SIFIVE-NEXT: [[ADD_PROL_6:%.*]] = add i8 [[SCALAR_LOAD_PROL_6]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_6]], ptr [[ARRAYIDX_PROL_6]], align 1
; SIFIVE-NEXT: [[INC_PROL_6:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 7
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ], [ [[INC_PROL_3]], %[[SCALAR_REMAINDER_PROL_3]] ], [ [[INC_PROL_4]], %[[SCALAR_REMAINDER_PROL_4]] ], [ [[INC_PROL_5]], %[[SCALAR_REMAINDER_PROL_5]] ], [ [[INC_PROL_6]], %[[SCALAR_REMAINDER_PROL_6]] ]
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[IV_SCALAR_LOOP]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
; SIFIVE-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 7
; SIFIVE-NEXT: br i1 [[TMP3]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PREHEADER_NEW]]:
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; SIFIVE: [[SCALAR_REMAINDER]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP1:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_7:%.*]], %[[SCALAR_REMAINDER]] ]
; SIFIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP1]]
; SIFIVE-NEXT: [[SCALAR_LOAD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; SIFIVE-NEXT: [[ADD1:%.*]] = add i8 [[SCALAR_LOAD1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD1]], ptr [[ARRAYIDX1]], align 1
; SIFIVE-NEXT: [[INC1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 1
; SIFIVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_1:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
; SIFIVE-NEXT: [[ADD_1:%.*]] = add i8 [[SCALAR_LOAD_1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
; SIFIVE-NEXT: [[INC_1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 2
; SIFIVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_2:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
; SIFIVE-NEXT: [[ADD_2:%.*]] = add i8 [[SCALAR_LOAD_2]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
; SIFIVE-NEXT: [[INC_2:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 3
; SIFIVE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
; SIFIVE-NEXT: [[SCALAR_LOAD_3:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
; SIFIVE-NEXT: [[ADD_3:%.*]] = add i8 [[SCALAR_LOAD_3]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; SIFIVE-NEXT: [[INC_3:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 4
; SIFIVE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_3]]
; SIFIVE-NEXT: [[SCALAR_LOAD_4:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
; SIFIVE-NEXT: [[ADD_4:%.*]] = add i8 [[SCALAR_LOAD_4]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_4]], ptr [[ARRAYIDX_4]], align 1
; SIFIVE-NEXT: [[INC_4:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 5
; SIFIVE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_4]]
; SIFIVE-NEXT: [[SCALAR_LOAD_5:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
; SIFIVE-NEXT: [[ADD_5:%.*]] = add i8 [[SCALAR_LOAD_5]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_5]], ptr [[ARRAYIDX_5]], align 1
; SIFIVE-NEXT: [[INC_5:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 6
; SIFIVE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_5]]
; SIFIVE-NEXT: [[SCALAR_LOAD_6:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
; SIFIVE-NEXT: [[ADD_6:%.*]] = add i8 [[SCALAR_LOAD_6]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_6]], ptr [[ARRAYIDX_6]], align 1
; SIFIVE-NEXT: [[INC_6:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 7
; SIFIVE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_6]]
; SIFIVE-NEXT: [[SCALAR_LOAD_7:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
; SIFIVE-NEXT: [[ADD_7:%.*]] = add i8 [[SCALAR_LOAD_7]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_7]], ptr [[ARRAYIDX_7]], align 1
; SIFIVE-NEXT: [[INC_7]] = add nuw i64 [[IV_SCALAR_LOOP1]], 8
; SIFIVE-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INC_7]], [[N]]
; SIFIVE-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; SIFIVE: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: br label %[[EXIT_LOOPEXIT]]
; SIFIVE: [[EXIT_LOOPEXIT]]:
; SIFIVE-NEXT: br label %[[EXIT]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry:
  %min.iters.check = icmp ult i64 %n, 32
  br i1 %min.iters.check, label %scalar.remainder, label %vector.ph

vector.ph:
  %n.vec = and i64 %n, -32
  %broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
  %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
  %gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
  %gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
  %wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
  %wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
  %add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
  %add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
  store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
  store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
  %iv.next = add nuw i64 %iv, 32
  %exit.cond = icmp eq i64 %iv.next, %n.vec
  br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2

middle.block:
  %cmp.n = icmp eq i64 %n, %n.vec
  br i1 %cmp.n, label %exit, label %scalar.remainder

scalar.remainder:
  %iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
  %scalar.load = load i8, ptr %arrayidx, align 1
  %add = add i8 %scalar.load, %splat.scalar
  store i8 %add, ptr %arrayidx, align 1
  %inc = add nuw i64 %iv.scalar.loop, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3

exit:
  ret void
}

define void @vector_operands(ptr %p, i64 %n) {
; COMMON-LABEL: define void @vector_operands(
; COMMON-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[ENTRY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[VL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; COMMON-NEXT: [[ADDR:%.*]] = getelementptr i64, ptr [[P]], i64 [[EVL_BASED_IV]]
; COMMON-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[ADDR]], <vscale x 2 x i1> splat (i1 true), i32 [[VL]])
; COMMON-NEXT: [[VL_ZEXT:%.*]] = zext i32 [[VL]] to i64
; COMMON-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[VL_ZEXT]], [[EVL_BASED_IV]]
; COMMON-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[VL_ZEXT]]
; COMMON-NEXT: [[TMP0:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; COMMON-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: ret void
;
entry:
  br label %vector.body

vector.body:
  %evl.based.iv = phi i64 [ 0, %entry ], [ %index.evl.next, %vector.body ]
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  %vl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
  %addr = getelementptr i64, ptr %p, i64 %evl.based.iv
  call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> splat (i64 0), ptr align 8 %addr, <vscale x 2 x i1> splat (i1 true), i32 %vl)
  %vl.zext = zext i32 %vl to i64
  %index.evl.next = add nuw i64 %vl.zext, %evl.based.iv
  %avl.next = sub nuw i64 %avl, %vl.zext
  %0 = icmp eq i64 %avl.next, 0
  br i1 %0, label %exit, label %vector.body, !llvm.loop !2

exit:
  ret void
}

!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = distinct !{!2, !1}
!3 = distinct !{!3, !1}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.
; SIFIVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; SIFIVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; SIFIVE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; SIFIVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.