; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s

; Every function below loads one wide vector from memory and then consumes it
; only through extractelement of each individual lane. The CHECK lines record
; the AArch64 assembly expected for that pattern.

; Loads <16 x i8>, extracts all 16 lanes and sums them with a pairwise tree of
; i8 adds. The expected code uses one ldrb per lane (offsets #0..#15) and
; 32-bit scalar adds; no vector load appears.
define i8 @scalarize_v16i8(ptr %p) {
; CHECK-LABEL: scalarize_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0, #3]
; CHECK-NEXT:    ldrb w9, [x0, #2]
; CHECK-NEXT:    ldrb w10, [x0, #1]
; CHECK-NEXT:    ldrb w11, [x0]
; CHECK-NEXT:    ldrb w13, [x0, #5]
; CHECK-NEXT:    ldrb w14, [x0, #4]
; CHECK-NEXT:    add w8, w9, w8
; CHECK-NEXT:    ldrb w12, [x0, #15]
; CHECK-NEXT:    ldrb w15, [x0, #11]
; CHECK-NEXT:    add w10, w11, w10
; CHECK-NEXT:    add w9, w14, w13
; CHECK-NEXT:    ldrb w11, [x0, #10]
; CHECK-NEXT:    ldrb w13, [x0, #9]
; CHECK-NEXT:    add w8, w10, w8
; CHECK-NEXT:    ldrb w14, [x0, #8]
; CHECK-NEXT:    ldrb w16, [x0, #7]
; CHECK-NEXT:    add w11, w11, w15
; CHECK-NEXT:    ldrb w17, [x0, #6]
; CHECK-NEXT:    ldrb w18, [x0, #14]
; CHECK-NEXT:    add w13, w14, w13
; CHECK-NEXT:    ldrb w1, [x0, #13]
; CHECK-NEXT:    ldrb w0, [x0, #12]
; CHECK-NEXT:    add w16, w17, w16
; CHECK-NEXT:    add w10, w13, w11
; CHECK-NEXT:    add w12, w18, w12
; CHECK-NEXT:    add w9, w9, w16
; CHECK-NEXT:    add w14, w0, w1
; CHECK-NEXT:    add w8, w8, w9
; CHECK-NEXT:    add w11, w14, w12
; CHECK-NEXT:    add w9, w10, w11
; CHECK-NEXT:    add w0, w8, w9
; CHECK-NEXT:    ret
  %wide.load = load <16 x i8>, ptr %p, align 4
  %l0 = extractelement <16 x i8> %wide.load, i32 0
  %l1 = extractelement <16 x i8> %wide.load, i32 1
  %l2 = extractelement <16 x i8> %wide.load, i32 2
  %l3 = extractelement <16 x i8> %wide.load, i32 3
  %l4 = extractelement <16 x i8> %wide.load, i32 4
  %l5 = extractelement <16 x i8> %wide.load, i32 5
  %l6 = extractelement <16 x i8> %wide.load, i32 6
  %l7 = extractelement <16 x i8> %wide.load, i32 7
  %l8 = extractelement <16 x i8> %wide.load, i32 8
  %l9 = extractelement <16 x i8> %wide.load, i32 9
  %l10 = extractelement <16 x i8> %wide.load, i32 10
  %l11 = extractelement <16 x i8> %wide.load, i32 11
  %l12 = extractelement <16 x i8> %wide.load, i32 12
  %l13 = extractelement <16 x i8> %wide.load, i32 13
  %l14 = extractelement <16 x i8> %wide.load, i32 14
  %l15 = extractelement <16 x i8> %wide.load, i32 15
  %a0 = add i8 %l0, %l1
  %a1 = add i8 %l2, %l3
  %a2 = add i8 %l4, %l5
  %a3 = add i8 %l6, %l7
  %a4 = add i8 %l8, %l9
  %a5 = add i8 %l10, %l11
  %a6 = add i8 %l12, %l13
  %a7 = add i8 %l14, %l15
  %b0 = add i8 %a0, %a1
  %b1 = add i8 %a2, %a3
  %b2 = add i8 %a4, %a5
  %b3 = add i8 %a6, %a7
  %c0 = add i8 %b0, %b1
  %c1 = add i8 %b2, %b3
  %r = add i8 %c0, %c1
  ret i8 %r
}

; Same pattern for <8 x i8>: eight lanes summed pairwise. The expected code
; uses one ldrb per lane (offsets #0..#7).
define i8 @scalarize_v8i8(ptr %p) {
; CHECK-LABEL: scalarize_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0, #7]
; CHECK-NEXT:    ldrb w9, [x0, #6]
; CHECK-NEXT:    ldrb w10, [x0, #5]
; CHECK-NEXT:    ldrb w11, [x0, #1]
; CHECK-NEXT:    ldrb w12, [x0]
; CHECK-NEXT:    ldrb w13, [x0, #4]
; CHECK-NEXT:    add w8, w9, w8
; CHECK-NEXT:    ldrb w14, [x0, #3]
; CHECK-NEXT:    ldrb w15, [x0, #2]
; CHECK-NEXT:    add w11, w12, w11
; CHECK-NEXT:    add w10, w13, w10
; CHECK-NEXT:    add w12, w15, w14
; CHECK-NEXT:    add w8, w10, w8
; CHECK-NEXT:    add w9, w11, w12
; CHECK-NEXT:    add w0, w9, w8
; CHECK-NEXT:    ret
  %wide.load = load <8 x i8>, ptr %p, align 4
  %l0 = extractelement <8 x i8> %wide.load, i32 0
  %l1 = extractelement <8 x i8> %wide.load, i32 1
  %l2 = extractelement <8 x i8> %wide.load, i32 2
  %l3 = extractelement <8 x i8> %wide.load, i32 3
  %l4 = extractelement <8 x i8> %wide.load, i32 4
  %l5 = extractelement <8 x i8> %wide.load, i32 5
  %l6 = extractelement <8 x i8> %wide.load, i32 6
  %l7 = extractelement <8 x i8> %wide.load, i32 7
  %a0 = add i8 %l0, %l1
  %a1 = add i8 %l2, %l3
  %a2 = add i8 %l4, %l5
  %a3 = add i8 %l6, %l7
  %b0 = add i8 %a0, %a1
  %b1 = add i8 %a2, %a3
  %r = add i8 %b0, %b1
  ret i8 %r
}

; <8 x i16> variant: eight lanes summed pairwise. The expected code uses one
; ldrh per lane (byte offsets #0..#14 in steps of 2).
define i16 @scalarize_v8i16(ptr %p) {
; CHECK-LABEL: scalarize_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0, #14]
; CHECK-NEXT:    ldrh w9, [x0, #12]
; CHECK-NEXT:    ldrh w10, [x0, #10]
; CHECK-NEXT:    ldrh w11, [x0, #2]
; CHECK-NEXT:    ldrh w12, [x0]
; CHECK-NEXT:    ldrh w13, [x0, #8]
; CHECK-NEXT:    add w8, w9, w8
; CHECK-NEXT:    ldrh w14, [x0, #6]
; CHECK-NEXT:    ldrh w15, [x0, #4]
; CHECK-NEXT:    add w11, w12, w11
; CHECK-NEXT:    add w10, w13, w10
; CHECK-NEXT:    add w12, w15, w14
; CHECK-NEXT:    add w8, w10, w8
; CHECK-NEXT:    add w9, w11, w12
; CHECK-NEXT:    add w0, w9, w8
; CHECK-NEXT:    ret
  %wide.load = load <8 x i16>, ptr %p, align 4
  %l0 = extractelement <8 x i16> %wide.load, i32 0
  %l1 = extractelement <8 x i16> %wide.load, i32 1
  %l2 = extractelement <8 x i16> %wide.load, i32 2
  %l3 = extractelement <8 x i16> %wide.load, i32 3
  %l4 = extractelement <8 x i16> %wide.load, i32 4
  %l5 = extractelement <8 x i16> %wide.load, i32 5
  %l6 = extractelement <8 x i16> %wide.load, i32 6
  %l7 = extractelement <8 x i16> %wide.load, i32 7
  %a0 = add i16 %l0, %l1
  %a1 = add i16 %l2, %l3
  %a2 = add i16 %l4, %l5
  %a3 = add i16 %l6, %l7
  %b0 = add i16 %a0, %a1
  %b1 = add i16 %a2, %a3
  %r = add i16 %b0, %b1
  ret i16 %r
}

; <4 x i16> variant: four lanes summed pairwise. The expected code uses one
; ldrh per lane.
define i16 @scalarize_v4i16(ptr %p) {
; CHECK-LABEL: scalarize_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0, #6]
; CHECK-NEXT:    ldrh w9, [x0, #4]
; CHECK-NEXT:    ldrh w10, [x0, #2]
; CHECK-NEXT:    ldrh w11, [x0]
; CHECK-NEXT:    add w8, w9, w8
; CHECK-NEXT:    add w10, w11, w10
; CHECK-NEXT:    add w0, w10, w8
; CHECK-NEXT:    ret
  %wide.load = load <4 x i16>, ptr %p, align 4
  %l0 = extractelement <4 x i16> %wide.load, i32 0
  %l1 = extractelement <4 x i16> %wide.load, i32 1
  %l2 = extractelement <4 x i16> %wide.load, i32 2
  %l3 = extractelement <4 x i16> %wide.load, i32 3
  %a0 = add i16 %l0, %l1
  %a1 = add i16 %l2, %l3
  %r = add i16 %a0, %a1
  ret i16 %r
}

; <4 x i32> variant: four lanes summed pairwise. The expected code loads the
; lanes two at a time with ldp of w registers.
define i32 @scalarize_v4i32(ptr %p) {
; CHECK-LABEL: scalarize_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp w9, w8, [x0]
; CHECK-NEXT:    ldp w10, w11, [x0, #8]
; CHECK-NEXT:    add w8, w9, w8
; CHECK-NEXT:    add w9, w10, w11
; CHECK-NEXT:    add w0, w8, w9
; CHECK-NEXT:    ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %l0 = extractelement <4 x i32> %wide.load, i32 0
  %l1 = extractelement <4 x i32> %wide.load, i32 1
  %l2 = extractelement <4 x i32> %wide.load, i32 2
  %l3 = extractelement <4 x i32> %wide.load, i32 3
  %a0 = add i32 %l0, %l1
  %a1 = add i32 %l2, %l3
  %r = add i32 %a0, %a1
  ret i32 %r
}

; <4 x i64> variant: four lanes summed pairwise. The expected code loads the
; lanes two at a time with ldp of x registers.
define i64 @scalarize_v4i64(ptr %p) {
; CHECK-LABEL: scalarize_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp x8, x9, [x0]
; CHECK-NEXT:    ldp x10, x11, [x0, #16]
; CHECK-NEXT:    add x8, x8, x9
; CHECK-NEXT:    add x9, x10, x11
; CHECK-NEXT:    add x0, x8, x9
; CHECK-NEXT:    ret
  %wide.load = load <4 x i64>, ptr %p, align 4
  %l0 = extractelement <4 x i64> %wide.load, i32 0
  %l1 = extractelement <4 x i64> %wide.load, i32 1
  %l2 = extractelement <4 x i64> %wide.load, i32 2
  %l3 = extractelement <4 x i64> %wide.load, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; Loads <4 x i32>, sign-extends the whole vector to <4 x i64>, then extracts
; and sums the lanes. The expected code folds the extension into the loads
; (ldpsw) and adds in 64-bit registers.
define i64 @scalarize_v4i32_sext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldpsw x9, x8, [x0, #8]
; CHECK-NEXT:    ldpsw x11, x10, [x0]
; CHECK-NEXT:    add x8, x9, x8
; CHECK-NEXT:    add x10, x11, x10
; CHECK-NEXT:    add x0, x10, x8
; CHECK-NEXT:    ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %ext = sext <4 x i32> %wide.load to <4 x i64>
  %l0 = extractelement <4 x i64> %ext, i32 0
  %l1 = extractelement <4 x i64> %ext, i32 1
  %l2 = extractelement <4 x i64> %ext, i32 2
  %l3 = extractelement <4 x i64> %ext, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; Zero-extending counterpart of scalarize_v4i32_sext. The expected code loads
; the lanes with ldp of w registers and adds the corresponding x registers; no
; separate extend instruction appears.
define i64 @scalarize_v4i32_zext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp w9, w8, [x0, #8]
; CHECK-NEXT:    ldp w11, w10, [x0]
; CHECK-NEXT:    add x8, x9, x8
; CHECK-NEXT:    add x10, x11, x10
; CHECK-NEXT:    add x0, x10, x8
; CHECK-NEXT:    ret
  %wide.load = load <4 x i32>, ptr %p, align 4
  %ext = zext <4 x i32> %wide.load to <4 x i64>
  %l0 = extractelement <4 x i64> %ext, i32 0
  %l1 = extractelement <4 x i64> %ext, i32 1
  %l2 = extractelement <4 x i64> %ext, i32 2
  %l3 = extractelement <4 x i64> %ext, i32 3
  %a0 = add i64 %l0, %l1
  %a1 = add i64 %l2, %l3
  %r = add i64 %a0, %a1
  ret i64 %r
}

; <4 x half> variant using fadd. Unlike the integer tests, the expected code
; keeps a single vector load (ldr d0) and moves lanes out with mov. Each half
; fadd is carried out in single precision: fcvt to s, fadd, fcvt back to h.
define half @scalarize_v4f16(ptr %p) {
; CHECK-LABEL: scalarize_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    mov h1, v0.h[1]
; CHECK-NEXT:    mov h2, v0.h[2]
; CHECK-NEXT:    mov h3, v0.h[3]
; CHECK-NEXT:    fcvt s0, h0
; CHECK-NEXT:    fcvt s1, h1
; CHECK-NEXT:    fcvt s3, h3
; CHECK-NEXT:    fcvt s2, h2
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fcvt h0, s0
; CHECK-NEXT:    fcvt h1, s1
; CHECK-NEXT:    fcvt s1, h1
; CHECK-NEXT:    fcvt s0, h0
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    fcvt h0, s0
; CHECK-NEXT:    ret
  %wide.load = load <4 x half>, ptr %p, align 4
  %l0 = extractelement <4 x half> %wide.load, i32 0
  %l1 = extractelement <4 x half> %wide.load, i32 1
  %l2 = extractelement <4 x half> %wide.load, i32 2
  %l3 = extractelement <4 x half> %wide.load, i32 3
  %a0 = fadd half %l0, %l1
  %a1 = fadd half %l2, %l3
  %r = fadd half %a0, %a1
  ret half %r
}

; <4 x float> variant. The expected code keeps the vector load (ldr q0), adds
; lanes 0 and 1 with a pairwise faddp, and moves lanes 2 and 3 out for a
; scalar fadd.
define float @scalarize_v4f32(ptr %p) {
; CHECK-LABEL: scalarize_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    mov s1, v0.s[2]
; CHECK-NEXT:    mov s2, v0.s[3]
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    fadd s1, s1, s2
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %wide.load = load <4 x float>, ptr %p, align 4
  %l0 = extractelement <4 x float> %wide.load, i32 0
  %l1 = extractelement <4 x float> %wide.load, i32 1
  %l2 = extractelement <4 x float> %wide.load, i32 2
  %l3 = extractelement <4 x float> %wide.load, i32 3
  %a0 = fadd float %l0, %l1
  %a1 = fadd float %l2, %l3
  %r = fadd float %a0, %a1
  ret float %r
}

; <4 x double> variant. The expected code loads two q registers with one ldp,
; reduces each with a pairwise faddp, and adds the two results.
define double @scalarize_v4f64(ptr %p) {
; CHECK-LABEL: scalarize_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q1, q0, [x0]
; CHECK-NEXT:    faddp d1, v1.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    fadd d0, d1, d0
; CHECK-NEXT:    ret
  %wide.load = load <4 x double>, ptr %p, align 4
  %l0 = extractelement <4 x double> %wide.load, i32 0
  %l1 = extractelement <4 x double> %wide.load, i32 1
  %l2 = extractelement <4 x double> %wide.load, i32 2
  %l3 = extractelement <4 x double> %wide.load, i32 3
  %a0 = fadd double %l0, %l1
  %a1 = fadd double %l2, %l3
  %r = fadd double %a0, %a1
  ret double %r
}

; Loads <16 x i64> from %23 and uses each extracted lane as an index into the
; float array %rawA; the 16 loaded floats are summed with a pairwise fadd
; tree. The expected code loads the indices two at a time with ldp and feeds
; each one to a register-offset ldr (lsl #2). Arguments %22 and %rawB are not
; used by the body.
define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp x8, x9, [x1]
; CHECK-NEXT:    ldp x10, x11, [x1, #16]
; CHECK-NEXT:    ldp x12, x13, [x1, #64]
; CHECK-NEXT:    ldr s0, [x2, x8, lsl #2]
; CHECK-NEXT:    ldr s1, [x2, x9, lsl #2]
; CHECK-NEXT:    ldp x8, x9, [x1, #32]
; CHECK-NEXT:    ldr s2, [x2, x10, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x11, lsl #2]
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldr s6, [x2, x12, lsl #2]
; CHECK-NEXT:    ldp x10, x11, [x1, #48]
; CHECK-NEXT:    ldr s7, [x2, x13, lsl #2]
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    ldr s2, [x2, x8, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x9, lsl #2]
; CHECK-NEXT:    ldp x14, x15, [x1, #80]
; CHECK-NEXT:    fadd s2, s2, s3
; CHECK-NEXT:    ldr s4, [x2, x10, lsl #2]
; CHECK-NEXT:    ldr s5, [x2, x11, lsl #2]
; CHECK-NEXT:    ldp x16, x17, [x1, #96]
; CHECK-NEXT:    fadd s3, s4, s5
; CHECK-NEXT:    fadd s4, s6, s7
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldp x18, x0, [x1, #112]
; CHECK-NEXT:    ldr s16, [x2, x14, lsl #2]
; CHECK-NEXT:    ldr s17, [x2, x15, lsl #2]
; CHECK-NEXT:    ldr s18, [x2, x16, lsl #2]
; CHECK-NEXT:    ldr s19, [x2, x17, lsl #2]
; CHECK-NEXT:    ldr s20, [x2, x18, lsl #2]
; CHECK-NEXT:    ldr s21, [x2, x0, lsl #2]
; CHECK-NEXT:    fadd s5, s16, s17
; CHECK-NEXT:    fadd s6, s18, s19
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s7, s20, s21
; CHECK-NEXT:    fadd s2, s4, s5
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    fadd s3, s6, s7
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
entry:
  %wide.load = load <16 x i64>, ptr %23, align 4
  %25 = extractelement <16 x i64> %wide.load, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %wide.load, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %wide.load, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %wide.load, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %wide.load, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %wide.load, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %wide.load, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %wide.load, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %wide.load, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %wide.load, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %wide.load, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %wide.load, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %wide.load, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %wide.load, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %wide.load, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %wide.load, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}

; Same as scalarize_into_load, but the index vector is <16 x i32> sign-extended
; to <16 x i64> before the lanes are extracted. The expected code loads the
; indices with ldpsw, so no separate sign-extend instruction appears.
; Arguments %22 and %rawB are not used by the body.
define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldpsw x9, x8, [x1]
; CHECK-NEXT:    ldpsw x11, x10, [x1, #8]
; CHECK-NEXT:    ldpsw x13, x12, [x1, #24]
; CHECK-NEXT:    ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT:    ldr s1, [x2, x8, lsl #2]
; CHECK-NEXT:    ldpsw x9, x8, [x1, #56]
; CHECK-NEXT:    ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldpsw x11, x10, [x1, #48]
; CHECK-NEXT:    ldpsw x15, x14, [x1, #16]
; CHECK-NEXT:    ldpsw x17, x16, [x1, #40]
; CHECK-NEXT:    ldpsw x0, x18, [x1, #32]
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    ldr s2, [x2, x15, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT:    ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT:    ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT:    ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT:    ldr s6, [x2, x0, lsl #2]
; CHECK-NEXT:    fadd s2, s2, s3
; CHECK-NEXT:    ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT:    ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT:    fadd s3, s4, s5
; CHECK-NEXT:    ldr s18, [x2, x11, lsl #2]
; CHECK-NEXT:    ldr s19, [x2, x10, lsl #2]
; CHECK-NEXT:    fadd s4, s6, s7
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldr s20, [x2, x9, lsl #2]
; CHECK-NEXT:    ldr s21, [x2, x8, lsl #2]
; CHECK-NEXT:    fadd s5, s16, s17
; CHECK-NEXT:    fadd s6, s18, s19
; CHECK-NEXT:    fadd s7, s20, s21
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s2, s4, s5
; CHECK-NEXT:    fadd s3, s6, s7
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
entry:
  %wide.load = load <16 x i32>, ptr %23, align 4
  %24 = sext <16 x i32> %wide.load to <16 x i64>
  %25 = extractelement <16 x i64> %24, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %24, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %24, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %24, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %24, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %24, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %24, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %24, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %24, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %24, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %24, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %24, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %24, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %24, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %24, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %24, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}

; Zero-extending counterpart of scalarize_into_load_sext. The expected code
; loads the indices with ldp of w registers and uses the corresponding x
; registers as the ldr offsets; no separate extend instruction appears.
; Arguments %22 and %rawB are not used by the body.
define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp w9, w8, [x1]
; CHECK-NEXT:    ldp w11, w10, [x1, #8]
; CHECK-NEXT:    ldp w13, w12, [x1, #24]
; CHECK-NEXT:    ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT:    ldr s1, [x2, x8, lsl #2]
; CHECK-NEXT:    ldp w9, w8, [x1, #56]
; CHECK-NEXT:    ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldp w11, w10, [x1, #48]
; CHECK-NEXT:    ldp w15, w14, [x1, #16]
; CHECK-NEXT:    ldp w17, w16, [x1, #40]
; CHECK-NEXT:    ldp w0, w18, [x1, #32]
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    ldr s2, [x2, x15, lsl #2]
; CHECK-NEXT:    ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT:    ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT:    ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT:    ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT:    ldr s6, [x2, x0, lsl #2]
; CHECK-NEXT:    fadd s2, s2, s3
; CHECK-NEXT:    ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT:    ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT:    fadd s3, s4, s5
; CHECK-NEXT:    ldr s18, [x2, x11, lsl #2]
; CHECK-NEXT:    ldr s19, [x2, x10, lsl #2]
; CHECK-NEXT:    fadd s4, s6, s7
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ldr s20, [x2, x9, lsl #2]
; CHECK-NEXT:    ldr s21, [x2, x8, lsl #2]
; CHECK-NEXT:    fadd s5, s16, s17
; CHECK-NEXT:    fadd s6, s18, s19
; CHECK-NEXT:    fadd s7, s20, s21
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s2, s4, s5
; CHECK-NEXT:    fadd s3, s6, s7
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    fadd s1, s2, s3
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
entry:
  %wide.load = load <16 x i32>, ptr %23, align 4
  %24 = zext <16 x i32> %wide.load to <16 x i64>
  %25 = extractelement <16 x i64> %24, i32 0
  %26 = getelementptr inbounds float, ptr %rawA, i64 %25
  %27 = extractelement <16 x i64> %24, i32 1
  %28 = getelementptr inbounds float, ptr %rawA, i64 %27
  %29 = extractelement <16 x i64> %24, i32 2
  %30 = getelementptr inbounds float, ptr %rawA, i64 %29
  %31 = extractelement <16 x i64> %24, i32 3
  %32 = getelementptr inbounds float, ptr %rawA, i64 %31
  %33 = extractelement <16 x i64> %24, i32 4
  %34 = getelementptr inbounds float, ptr %rawA, i64 %33
  %35 = extractelement <16 x i64> %24, i32 5
  %36 = getelementptr inbounds float, ptr %rawA, i64 %35
  %37 = extractelement <16 x i64> %24, i32 6
  %38 = getelementptr inbounds float, ptr %rawA, i64 %37
  %39 = extractelement <16 x i64> %24, i32 7
  %40 = getelementptr inbounds float, ptr %rawA, i64 %39
  %41 = extractelement <16 x i64> %24, i32 8
  %42 = getelementptr inbounds float, ptr %rawA, i64 %41
  %43 = extractelement <16 x i64> %24, i32 9
  %44 = getelementptr inbounds float, ptr %rawA, i64 %43
  %45 = extractelement <16 x i64> %24, i32 10
  %46 = getelementptr inbounds float, ptr %rawA, i64 %45
  %47 = extractelement <16 x i64> %24, i32 11
  %48 = getelementptr inbounds float, ptr %rawA, i64 %47
  %49 = extractelement <16 x i64> %24, i32 12
  %50 = getelementptr inbounds float, ptr %rawA, i64 %49
  %51 = extractelement <16 x i64> %24, i32 13
  %52 = getelementptr inbounds float, ptr %rawA, i64 %51
  %53 = extractelement <16 x i64> %24, i32 14
  %54 = getelementptr inbounds float, ptr %rawA, i64 %53
  %55 = extractelement <16 x i64> %24, i32 15
  %56 = getelementptr inbounds float, ptr %rawA, i64 %55
  %59 = load float, ptr %26, align 4
  %60 = load float, ptr %28, align 4
  %61 = load float, ptr %30, align 4
  %62 = load float, ptr %32, align 4
  %63 = load float, ptr %34, align 4
  %64 = load float, ptr %36, align 4
  %65 = load float, ptr %38, align 4
  %66 = load float, ptr %40, align 4
  %67 = load float, ptr %42, align 4
  %68 = load float, ptr %44, align 4
  %69 = load float, ptr %46, align 4
  %70 = load float, ptr %48, align 4
  %71 = load float, ptr %50, align 4
  %72 = load float, ptr %52, align 4
  %73 = load float, ptr %54, align 4
  %74 = load float, ptr %56, align 4
  %a1 = fadd float %59, %60
  %a2 = fadd float %61, %62
  %a3 = fadd float %63, %64
  %a4 = fadd float %65, %66
  %a5 = fadd float %67, %68
  %a6 = fadd float %69, %70
  %a7 = fadd float %71, %72
  %a8 = fadd float %73, %74
  %a9 = fadd float %a1, %a2
  %a10 = fadd float %a3, %a4
  %a11 = fadd float %a5, %a6
  %a12 = fadd float %a7, %a8
  %a13 = fadd float %a9, %a10
  %a14 = fadd float %a11, %a12
  %a15 = fadd float %a13, %a14
  ret float %a15
}