; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s

; Every function in this file loads one wide vector from memory and then uses
; it only through extractelement of each individual lane. The lanes are either
; summed with a balanced tree of scalar adds, or used as indices for further
; scalar float loads whose results are summed the same way.
;
; The assertion comments under each function are autogenerated (see the NOTE
; above); regenerate them with the script instead of editing them by hand.

; <16 x i8> loaded from %p (align 4); all 16 lanes are extracted and reduced
; with 8 + 4 + 2 + 1 scalar i8 adds. The recorded output is a single q-register
; load, one umov per byte lane, and 32-bit scalar adds.
define i8 @scalarize_v16i8(ptr %p) {
; CHECK-LABEL: scalarize_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: umov w8, v0.b[0]
; CHECK-NEXT: umov w9, v0.b[1]
; CHECK-NEXT: umov w10, v0.b[2]
; CHECK-NEXT: umov w11, v0.b[3]
; CHECK-NEXT: umov w12, v0.b[4]
; CHECK-NEXT: umov w13, v0.b[5]
; CHECK-NEXT: umov w14, v0.b[6]
; CHECK-NEXT: umov w15, v0.b[7]
; CHECK-NEXT: umov w16, v0.b[8]
; CHECK-NEXT: umov w17, v0.b[9]
; CHECK-NEXT: umov w18, v0.b[10]
; CHECK-NEXT: umov w0, v0.b[11]
; CHECK-NEXT: umov w1, v0.b[12]
; CHECK-NEXT: umov w2, v0.b[13]
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: umov w3, v0.b[14]
; CHECK-NEXT: umov w4, v0.b[15]
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w10, w12, w13
; CHECK-NEXT: add w11, w14, w15
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w12, w16, w17
; CHECK-NEXT: add w13, w18, w0
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w14, w1, w2
; CHECK-NEXT: add w10, w12, w13
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w15, w3, w4
; CHECK-NEXT: add w11, w14, w15
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
  %wide.load = load <16 x i8>, ptr %p, align 4
  ; Pull every lane out of the loaded vector.
  %l0 = extractelement <16 x i8> %wide.load, i32 0
  %l1 = extractelement <16 x i8> %wide.load, i32 1
  %l2 = extractelement <16 x i8> %wide.load, i32 2
  %l3 = extractelement <16 x i8> %wide.load, i32 3
  %l4 = extractelement <16 x i8> %wide.load, i32 4
  %l5 = extractelement <16 x i8> %wide.load, i32 5
  %l6 = extractelement <16 x i8> %wide.load, i32 6
  %l7 = extractelement <16 x i8> %wide.load, i32 7
  %l8 = extractelement <16 x i8> %wide.load, i32 8
  %l9 = extractelement <16 x i8> %wide.load, i32 9
  %l10 = extractelement <16 x i8> %wide.load, i32 10
  %l11 = extractelement <16 x i8> %wide.load, i32 11
  %l12 = extractelement <16 x i8> %wide.load, i32 12
  %l13 = extractelement <16 x i8> %wide.load, i32 13
  %l14 = extractelement <16 x i8> %wide.load, i32 14
  %l15 = extractelement <16 x i8> %wide.load, i32 15
  ; Level 1 of the reduction tree: adjacent lanes.
  %a0 = add i8 %l0, %l1
  %a1 = add i8 %l2, %l3
  %a2 = add i8 %l4, %l5
  %a3 = add i8 %l6, %l7
  %a4 = add i8 %l8, %l9
  %a5 = add i8 %l10, %l11
  %a6 = add i8 %l12, %l13
  %a7 = add i8 %l14, %l15
  ; Levels 2-4: keep halving until one value remains.
  %b0 = add i8 %a0, %a1
  %b1 = add i8 %a2, %a3
  %b2 = add i8 %a4, %a5
  %b3 = add i8 %a6, %a7
  %c0 = add i8 %b0, %b1
  %c1 = add i8 %b2, %b3
  %r = add i8 %c0, %c1
  ret i8 %r
}

; <8 x i8> variant: the 64-bit vector is loaded into a d register, each of the
; 8 lanes is extracted with umov, and the lanes are summed with 4 + 2 + 1 adds.
define i8 @scalarize_v8i8(ptr %p) {
; CHECK-LABEL: scalarize_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: umov w8, v0.b[0]
; CHECK-NEXT: umov w9, v0.b[1]
; CHECK-NEXT: umov w10, v0.b[2]
; CHECK-NEXT: umov w11, v0.b[3]
; CHECK-NEXT: umov w12, v0.b[4]
; CHECK-NEXT: umov w13, v0.b[5]
; CHECK-NEXT: umov w14, v0.b[6]
; CHECK-NEXT: umov w15, v0.b[7]
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w10, w12, w13
; CHECK-NEXT: add w11, w14, w15
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
  %wide.load = load <8 x i8>, ptr %p, align 4
  %l0 = extractelement <8 x i8> %wide.load, i32 0
  %l1 = extractelement <8 x i8> %wide.load, i32 1
  %l2 = extractelement <8 x i8> %wide.load, i32 2
  %l3 = extractelement <8 x i8> %wide.load, i32 3
  %l4 = extractelement <8 x i8> %wide.load, i32 4
  %l5 = extractelement <8 x i8> %wide.load, i32 5
  %l6 = extractelement <8 x i8> %wide.load, i32 6
  %l7 = extractelement <8 x i8> %wide.load, i32 7
  ; Balanced add tree over the eight lanes.
  %a0 = add i8 %l0, %l1
  %a1 = add i8 %l2, %l3
  %a2 = add i8 %l4, %l5
  %a3 = add i8 %l6, %l7
  %b0 = add i8 %a0, %a1
  %b1 = add i8 %a2, %a3
  %r = add i8 %b0, %b1
  ret i8 %r
}

; <8 x i16> variant: q-register load, one umov per halfword lane, then a
; 4 + 2 + 1 tree of scalar adds.
define i16 @scalarize_v8i16(ptr %p) {
; CHECK-LABEL: scalarize_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: umov w9, v0.h[1]
; CHECK-NEXT: umov w10, v0.h[2]
; CHECK-NEXT: umov w11, v0.h[3]
; CHECK-NEXT: umov w12, v0.h[4]
; CHECK-NEXT: umov w13, v0.h[5]
; CHECK-NEXT: umov w14, v0.h[6]
; CHECK-NEXT: umov w15, v0.h[7]
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w10, w12, w13
; CHECK-NEXT: add w11, w14, w15
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
  %wide.load = load <8 x i16>, ptr %p, align 4
  %l0 = extractelement <8 x i16> %wide.load, i32 0
  %l1 = extractelement <8 x i16> %wide.load, i32 1
  %l2 = extractelement <8 x i16> %wide.load, i32 2
  %l3 = extractelement <8 x i16> %wide.load, i32 3
  %l4 = extractelement <8 x i16> %wide.load, i32 4
  %l5 = extractelement <8 x i16> %wide.load, i32 5
  %l6 = extractelement <8 x i16> %wide.load, i32 6
  %l7 = extractelement <8 x i16> %wide.load, i32 7
  ; Balanced add tree over the eight lanes.
  %a0 = add i16 %l0, %l1
  %a1 = add i16 %l2, %l3
  %a2 = add i16 %l4, %l5
  %a3 = add i16 %l6, %l7
  %b0 = add i16 %a0, %a1
  %b1 = add i16 %a2, %a3
  %r = add i16 %b0, %b1
  ret i16 %r
}

; <4 x i16> variant: d-register load, four umov lane extracts, 2 + 1 adds.
define i16 @scalarize_v4i16(ptr %p) {
; CHECK-LABEL: scalarize_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: umov w9, v0.h[1]
; CHECK-NEXT: umov w10, v0.h[2]
; CHECK-NEXT: umov w11, v0.h[3]
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
  %wide.load = load <4 x i16>, ptr %p, align 4
  %l0 = extractelement <4 x i16> %wide.load, i32 0
  %l1 = extractelement <4 x i16> %wide.load, i32 1
  %l2 = extractelement <4 x i16> %wide.load, i32 2
  %l3 = extractelement <4 x i16> %wide.load, i32 3
  %a0 = add i16 %l0, %l1
  %a1 = add i16 %l2, %l3
  %r = add i16 %a0, %a1
  ret i16 %r
}

; <4 x i32> variant: lanes 1-3 are moved out with lane-indexed mov and lane 0
; with fmov from s0, followed by 2 + 1 scalar adds.
define i32 @scalarize_v4i32(ptr %p) {
; CHECK-LABEL: scalarize_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: mov w9, v0.s[2]
; CHECK-NEXT: mov w10, v0.s[3]
; CHECK-NEXT: fmov w11, s0
; CHECK-NEXT: add w8, w11, w8
; CHECK-NEXT: add w9, w9, w10
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %l0 = extractelement <4 x i32> %wide.load, i32 0
  %l1 = extractelement <4 x i32> %wide.load, i32 1
  %l2 = extractelement <4 x i32> %wide.load, i32 2
  %l3 = extractelement <4 x i32> %wide.load, i32 3
  %a0 = add i32 %l0, %l1
  %a1 = add i32 %l2, %l3
  %r = add i32 %a0, %a1
  ret i32 %r
}

; <4 x i64> variant: the 256-bit vector is loaded as two q registers with ldp.
; Each adjacent-lane add is done with a pairwise addp, and only the two partial
; sums are moved to general-purpose registers for the final add.
define i64 @scalarize_v4i64(ptr %p) {
; CHECK-LABEL: scalarize_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: addp d1, v1.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
  %wide.load = load <4 x i64>, ptr %p, align 4
  %l0 = extractelement <4 x i64> %wide.load, i32 0
  %l1 = extractelement <4 x i64> %wide.load, i32 1
  %l2 = extractelement <4 x i64> %wide.load, i32 2
  %l3 = extractelement <4 x i64> %wide.load, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; <4 x i32> is loaded, sign-extended as a vector to <4 x i64>, and only then
; are the lanes extracted and summed. The recorded output widens both halves
; with sshll/sshll2 and reduces each half with addp.
define i64 @scalarize_v4i32_sext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: addp d1, v1.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %ext = sext <4 x i32> %wide.load to <4 x i64>
  %l0 = extractelement <4 x i64> %ext, i32 0
  %l1 = extractelement <4 x i64> %ext, i32 1
  %l2 = extractelement <4 x i64> %ext, i32 2
  %l3 = extractelement <4 x i64> %ext, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; Same shape as scalarize_v4i32_sext but with a zero extension; the recorded
; output uses ushll/ushll2 for the widening.
define i64 @scalarize_v4i32_zext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: addp d1, v1.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %ext = zext <4 x i32> %wide.load to <4 x i64>
  %l0 = extractelement <4 x i64> %ext, i32 0
  %l1 = extractelement <4 x i64> %ext, i32 1
  %l2 = extractelement <4 x i64> %ext, i32 2
  %l3 = extractelement <4 x i64> %ext, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; <4 x half> variant using fadd. In the recorded output each half value is
; converted to single precision with fcvt, added as a float, and converted back
; to half after every fadd.
define half @scalarize_v4f16(ptr %p) {
; CHECK-LABEL: scalarize_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: mov h2, v0.h[2]
; CHECK-NEXT: mov h3, v0.h[3]
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s3, h3
; CHECK-NEXT: fcvt s2, h2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: fcvt h1, s1
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
  %wide.load = load <4 x half>, ptr %p, align 4
  %l0 = extractelement <4 x half> %wide.load, i32 0
  %l1 = extractelement <4 x half> %wide.load, i32 1
  %l2 = extractelement <4 x half> %wide.load, i32 2
  %l3 = extractelement <4 x half> %wide.load, i32 3
  %a0 = fadd half %l0, %l1
  %a1 = fadd half %l2, %l3
  %r = fadd half %a0, %a1
  ret half %r
}

; <4 x float> variant. In the recorded output lanes 0 and 1 are added with a
; pairwise faddp, while lanes 2 and 3 are moved to scalar registers and added
; with a plain fadd.
define float @scalarize_v4f32(ptr %p) {
; CHECK-LABEL: scalarize_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov s1, v0.s[2]
; CHECK-NEXT: mov s2, v0.s[3]
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: fadd s1, s1, s2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %wide.load = load <4 x float>, ptr %p, align 4
  %l0 = extractelement <4 x float> %wide.load, i32 0
  %l1 = extractelement <4 x float> %wide.load, i32 1
  %l2 = extractelement <4 x float> %wide.load, i32 2
  %l3 = extractelement <4 x float> %wide.load, i32 3
  %a0 = fadd float %l0, %l1
  %a1 = fadd float %l2, %l3
  %r = fadd float %a0, %a1
  ret float %r
}

; <4 x double> variant: loaded as two q registers with ldp, each register
; reduced with faddp, and the two partial sums combined with one fadd.
define double @scalarize_v4f64(ptr %p) {
; CHECK-LABEL: scalarize_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: faddp d1, v1.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ret
  %wide.load = load <4 x double>, ptr %p, align 4
  %l0 = extractelement <4 x double> %wide.load, i32 0
  %l1 = extractelement <4 x double> %wide.load, i32 1
  %l2 = extractelement <4 x double> %wide.load, i32 2
  %l3 = extractelement <4 x double> %wide.load, i32 3
  %a0 = fadd double %l0, %l1
  %a1 = fadd double %l2, %l3
  %r = fadd double %a0, %a1
  ret double %r
}

; Loads <16 x i64> from %23 and uses each lane as an element index into the
; float array at %rawA; the 16 loaded floats are summed with a balanced fadd
; tree. The arguments %22 and %rawB are not referenced by the body.
;
; In the recorded output the index vector is loaded with four ldp
; instructions, every lane is moved into a general-purpose register, and each
; float is fetched with a register-offset ldr scaled by 4 (lsl #2).
define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q1, q0, [x1]
; CHECK-NEXT: ldp q3, q2, [x1, #96]
; CHECK-NEXT: ldp q5, q4, [x1, #64]
; CHECK-NEXT: ldp q7, q6, [x1, #32]
; CHECK-NEXT: mov x8, v1.d[1]
; CHECK-NEXT: mov x10, v0.d[1]
; CHECK-NEXT: mov x1, v3.d[1]
; CHECK-NEXT: mov x4, v2.d[1]
; CHECK-NEXT: mov x16, v5.d[1]
; CHECK-NEXT: mov x18, v4.d[1]
; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: mov x12, v7.d[1]
; CHECK-NEXT: mov x14, v6.d[1]
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: fmov x13, d7
; CHECK-NEXT: fmov x15, d6
; CHECK-NEXT: fmov x17, d5
; CHECK-NEXT: fmov x0, d4
; CHECK-NEXT: fmov x3, d3
; CHECK-NEXT: fmov x5, d2
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s4, s16, s17
; CHECK-NEXT: fadd s5, s18, s19
; CHECK-NEXT: fadd s6, s20, s21
; CHECK-NEXT: fadd s7, s22, s23
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
  %wide.load = load <16 x i64>, ptr %23, align 4
  ; For each lane: extract the index, then form the address &rawA[index].
  %25 = extractelement <16 x i64> %wide.load, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %wide.load, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %wide.load, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %wide.load, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %wide.load, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %wide.load, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %wide.load, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %wide.load, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %wide.load, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %wide.load, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %wide.load, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %wide.load, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %wide.load, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %wide.load, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %wide.load, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %wide.load, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  ; Load the 16 addressed floats.
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  ; Balanced fadd tree: 8 + 4 + 2 + 1 additions.
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}

; Same gather-and-sum shape as scalarize_into_load, but the indices are loaded
; as <16 x i32> and sign-extended as a vector to <16 x i64> before the lanes
; are extracted. The recorded output widens with sshll/sshll2 and then moves
; each 64-bit lane into a general-purpose register. %22 and %rawB are unused.
define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q0, q2, [x1]
; CHECK-NEXT: ldp q4, q1, [x1, #32]
; CHECK-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0
; CHECK-NEXT: sshll2 v6.2d, v2.4s, #0
; CHECK-NEXT: sshll2 v5.2d, v1.4s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-NEXT: sshll v2.2d, v2.2s, #0
; CHECK-NEXT: sshll2 v7.2d, v4.4s, #0
; CHECK-NEXT: sshll v4.2d, v4.2s, #0
; CHECK-NEXT: mov x8, v3.d[1]
; CHECK-NEXT: mov x10, v0.d[1]
; CHECK-NEXT: mov x14, v6.d[1]
; CHECK-NEXT: mov x12, v2.d[1]
; CHECK-NEXT: mov x1, v1.d[1]
; CHECK-NEXT: mov x4, v5.d[1]
; CHECK-NEXT: mov x16, v4.d[1]
; CHECK-NEXT: mov x18, v7.d[1]
; CHECK-NEXT: fmov x9, d3
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: fmov x13, d2
; CHECK-NEXT: fmov x15, d6
; CHECK-NEXT: fmov x17, d4
; CHECK-NEXT: fmov x0, d7
; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
; CHECK-NEXT: fmov x3, d1
; CHECK-NEXT: fmov x5, d5
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s4, s16, s17
; CHECK-NEXT: fadd s5, s18, s19
; CHECK-NEXT: fadd s6, s20, s21
; CHECK-NEXT: fadd s7, s22, s23
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
  %wide.load = load <16 x i32>, ptr %23, align 4
  ; Widen the whole index vector before any lane is extracted.
  %24 = sext <16 x i32> %wide.load to <16 x i64>
  ; For each lane: extract the widened index, then form &rawA[index].
  %25 = extractelement <16 x i64> %24, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %24, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %24, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %24, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %24, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %24, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %24, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %24, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %24, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %24, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %24, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %24, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %24, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %24, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %24, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %24, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  ; Load the 16 addressed floats.
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  ; Balanced fadd tree: 8 + 4 + 2 + 1 additions.
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}

; Zero-extending counterpart of scalarize_into_load_sext: the <16 x i32>
; indices are widened with zext, and the recorded output uses ushll/ushll2 for
; the widening. %22 and %rawB are unused.
define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q0, q2, [x1]
; CHECK-NEXT: ldp q4, q1, [x1, #32]
; CHECK-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-NEXT: ushll2 v6.2d, v2.4s, #0
; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0
; CHECK-NEXT: ushll v4.2d, v4.2s, #0
; CHECK-NEXT: mov x8, v3.d[1]
; CHECK-NEXT: mov x10, v0.d[1]
; CHECK-NEXT: mov x14, v6.d[1]
; CHECK-NEXT: mov x12, v2.d[1]
; CHECK-NEXT: mov x1, v1.d[1]
; CHECK-NEXT: mov x4, v5.d[1]
; CHECK-NEXT: mov x16, v4.d[1]
; CHECK-NEXT: mov x18, v7.d[1]
; CHECK-NEXT: fmov x9, d3
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: fmov x13, d2
; CHECK-NEXT: fmov x15, d6
; CHECK-NEXT: fmov x17, d4
; CHECK-NEXT: fmov x0, d7
; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
; CHECK-NEXT: fmov x3, d1
; CHECK-NEXT: fmov x5, d5
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s4, s16, s17
; CHECK-NEXT: fadd s5, s18, s19
; CHECK-NEXT: fadd s6, s20, s21
; CHECK-NEXT: fadd s7, s22, s23
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
  %wide.load = load <16 x i32>, ptr %23, align 4
  ; Widen the whole index vector before any lane is extracted.
  %24 = zext <16 x i32> %wide.load to <16 x i64>
  ; For each lane: extract the widened index, then form &rawA[index].
  %25 = extractelement <16 x i64> %24, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %24, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %24, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %24, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %24, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %24, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %24, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %24, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %24, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %24, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %24, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %24, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %24, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %24, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %24, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %24, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  ; Load the 16 addressed floats.
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  ; Balanced fadd tree: 8 + 4 + 2 + 1 additions.
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}