; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Each function below computes pDst[i] = pSrc[2*i]^2 + pSrc[2*i+1]^2 for
; numSamples interleaved pairs. The IR is already vectorised: a wide load of
; 2*VF elements is squared, split into two VF-wide halves by shufflevector and
; summed; a scalar remainder loop handles the tail or the overlapping case.
; The expected assembly uses the MVE vld20/vld21 de-interleaving load pair
; instead of a wide load plus explicit shuffles.
;
; NOTE(review): the shufflevector masks are written as even-lane / odd-lane
; selects, which is what the vld20/vld21 + vmul/vfma sequence in the expected
; assembly implies. Confirm against the upstream copy of this test and
; re-run utils/update_llc_test_checks.py if anything differs.

; f16, VF = 8. The wide vector is squared twice (two identical fmuls), one
; fmul feeding each shuffle.
define void @arm_cmplx_mag_squared_f16(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB0_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #8
; CHECK-NEXT:    blo .LBB0_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB0_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r3, r4, #8
; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
; CHECK-NEXT:    and r5, r2, #7
; CHECK-NEXT:  .LBB0_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f16 q0, q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB0_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB0_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r3]
; CHECK-NEXT:    vldr.16 s2, [r3, #2]
; CHECK-NEXT:    adds r3, #4
; CHECK-NEXT:    vmul.f16 s0, s0, s0
; CHECK-NEXT:    vfma.f16 s0, s2, s2
; CHECK-NEXT:    vstr.16 s0, [r12]
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    le lr, .LBB0_7
; CHECK-NEXT:  .LBB0_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB0_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB0_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 8
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -8
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr half, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 7
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr half, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
  %4 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %5 = fmul fast <16 x half> %wide.vec, %wide.vec
  %6 = shufflevector <16 x half> %5, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %7 = fadd fast <8 x half> %6, %4
  store <8 x half> %7, ptr %next.gep24, align 2
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body.preheader26, %while.body
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
  %9 = load half, ptr %pSrc.addr.014, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
  %10 = load half, ptr %incdec.ptr, align 2
  %mul = fmul fast half %9, %9
  %mul2 = fmul fast half %10, %10
  %add = fadd fast half %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
  store half %add, ptr %pDst.addr.013, align 2
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; f32, VF = 4. Same shape as the f16 function above: two identical fmuls, one
; per shuffle.
define void @arm_cmplx_mag_squared_f32(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cbz r2, .LBB1_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #4
; CHECK-NEXT:    blo .LBB1_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB1_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    subs r3, r4, #4
; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
; CHECK-NEXT:    and r5, r2, #3
; CHECK-NEXT:  .LBB1_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f32 q0, q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB1_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB1_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB1_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s0, [r3]
; CHECK-NEXT:    vldr s2, [r3, #4]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmul.f32 s0, s0, s0
; CHECK-NEXT:    vfma.f32 s0, s2, s2
; CHECK-NEXT:    vstmia r12!, {s0}
; CHECK-NEXT:    le lr, .LBB1_7
; CHECK-NEXT:  .LBB1_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB1_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB1_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 4
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -4
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr float, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %5 = fmul fast <8 x float> %wide.vec, %wide.vec
  %6 = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %7 = fadd fast <4 x float> %6, %4
  store <4 x float> %7, ptr %next.gep24, align 4
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body.preheader26, %while.body
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
  %9 = load float, ptr %pSrc.addr.014, align 4
  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
  %10 = load float, ptr %incdec.ptr, align 4
  %mul = fmul fast float %9, %9
  %mul2 = fmul fast float %10, %10
  %add = fadd fast float %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
  store float %add, ptr %pDst.addr.013, align 4
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; f16, VF = 8, "cse" variant: a single fmul of the wide vector feeds both
; shuffles. The expected assembly is the same as for the non-cse f16 function.
define void @arm_cmplx_mag_squared_f16_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16_cse:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB2_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #8
; CHECK-NEXT:    blo .LBB2_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB2_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r3, r4, #8
; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
; CHECK-NEXT:    and r5, r2, #7
; CHECK-NEXT:  .LBB2_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f16 q0, q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB2_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB2_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB2_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r3]
; CHECK-NEXT:    vldr.16 s2, [r3, #2]
; CHECK-NEXT:    adds r3, #4
; CHECK-NEXT:    vmul.f16 s0, s0, s0
; CHECK-NEXT:    vfma.f16 s0, s2, s2
; CHECK-NEXT:    vstr.16 s0, [r12]
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    le lr, .LBB2_7
; CHECK-NEXT:  .LBB2_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB2_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB2_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 8
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -8
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr half, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 7
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr half, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
  %4 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %5 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %6 = fadd fast <8 x half> %5, %4
  store <8 x half> %6, ptr %next.gep24, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body, %while.body.preheader26
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
  %8 = load half, ptr %pSrc.addr.014, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
  %9 = load half, ptr %incdec.ptr, align 2
  %mul = fmul fast half %8, %8
  %mul2 = fmul fast half %9, %9
  %add = fadd fast half %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
  store half %add, ptr %pDst.addr.013, align 2
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; f32, VF = 4, "cse" variant: a single fmul of the wide vector feeds both
; shuffles. The expected assembly is the same as for the non-cse f32 function.
define void @arm_cmplx_mag_squared_f32_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f32_cse:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cbz r2, .LBB3_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #4
; CHECK-NEXT:    blo .LBB3_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB3_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    subs r3, r4, #4
; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
; CHECK-NEXT:    and r5, r2, #3
; CHECK-NEXT:  .LBB3_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f32 q0, q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB3_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB3_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s0, [r3]
; CHECK-NEXT:    vldr s2, [r3, #4]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmul.f32 s0, s0, s0
; CHECK-NEXT:    vfma.f32 s0, s2, s2
; CHECK-NEXT:    vstmia r12!, {s0}
; CHECK-NEXT:    le lr, .LBB3_7
; CHECK-NEXT:  .LBB3_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB3_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB3_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 4
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -4
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr float, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %5 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %4
  store <4 x float> %6, ptr %next.gep24, align 4
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body, %while.body.preheader26
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
  %8 = load float, ptr %pSrc.addr.014, align 4
  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
  %9 = load float, ptr %incdec.ptr, align 4
  %mul = fmul fast float %8, %8
  %mul2 = fmul fast float %9, %9
  %add = fadd fast float %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
  store float %add, ptr %pDst.addr.013, align 4
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}