author | David Green <david.green@arm.com> | 2023-02-07 16:39:20 +0000
committer | David Green <david.green@arm.com> | 2023-02-07 16:39:20 +0000
commit | 9bd58f6f051c9b5546bbaa891fa11d3624b8d9a4
tree | 62e6b4043dc65c7973e250e49a4baa35723f5ee1
parent | c651c0878ffb8a67d77d3749001fbf780f44d953
[ARM][AArch64][RISCV] Add tests for various double reductions. NFC
-rw-r--r-- | llvm/test/CodeGen/AArch64/double_reduct.ll | 307
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve-doublereduct.ll | 289
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve-fp-reduce.ll | 16
-rw-r--r-- | llvm/test/CodeGen/AArch64/vecreduce-add.ll | 279
-rw-r--r-- | llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 192
-rw-r--r-- | llvm/test/CodeGen/RISCV/double_reduct.ll | 369
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-doublereduct.ll | 329
7 files changed, 1781 insertions(+), 0 deletions(-)
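All seven files exercise the same "double reduction" shape: two `llvm.vector.reduce.*` calls whose scalar results are then combined by a matching scalar operation, presumably in preparation for a later combine that merges the pair into one reduction. As a rough orientation before the full diff, here is a minimal sketch of that pattern in LLVM IR; the function name is illustrative and the example is not taken verbatim from the patch:

```llvm
; Minimal sketch (not from the patch): two reductions of separate vectors
; feeding one scalar fadd, the shape a combine could fold into a single
; reduction over the concatenated inputs.
define float @double_reduct_sketch(<4 x float> %a, <4 x float> %b) {
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
```

The integer, min/max, and extending-add variants in the new tests follow the same structure, using the corresponding reduce intrinsics and scalar combining operations.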
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll new file mode 100644 index 0000000..1fd1eb6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -0,0 +1,307 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64-eabi < %s | FileCheck %s + +define float @add_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: add_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s1, v2.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fmul_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmul_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fmul v1.2s, v2.2s, v3.2s +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fmul s1, s1, v1.s[1] +; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-NEXT: fmul s0, s0, v0.s[1] +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) + %r = fmul fast float %r1, %r2 + ret float %r +} + +define float @fmin_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmin_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fminnmv s2, v2.4s +; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: fminnm s0, s0, s2 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) + %r = call float @llvm.minnum.f32(float %r1, float %r2) + ret float %r +} + +define float @fmax_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmax_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fmaxnmv s2, v2.4s +; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) + %r = call float @llvm.maxnum.f32(float %r1, float %r2) + ret float %r +} + + +define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b) + %r = add i32 %r1, %r2 + ret i32 %r +} + +define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: add_ext_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: uaddlv h1, v1.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %ae = zext <16 x i8> %a to <16 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i16 @add_ext_v32i16(<32 x i8> %a, <16 x 
i8> %b) { +; CHECK-LABEL: add_ext_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: uaddlv h2, v2.16b +; CHECK-NEXT: add v0.8h, v0.8h, v3.8h +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %ae = zext <32 x i8> %a to <32 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: mul_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v1.2s, v2.2s, v3.2s +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-NEXT: mul w9, w11, w9 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mul w8, w10, w8 +; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) + %r = mul i32 %r1, %r2 + ret i32 %r +} + +define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: and_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v2.s[1] +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: and w9, w10, w9 +; CHECK-NEXT: and w8, w11, w8 +; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) + %r = and i32 %r1, %r2 + ret i32 %r +} + +define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: or_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: orr v2.8b, v2.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v2.s[1] +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w10, w9 +; CHECK-NEXT: orr w8, w11, w8 +; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) + %r = or i32 %r1, %r2 + ret i32 %r +} + +define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: xor_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v2.s[1] +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: eor w9, w10, w9 +; CHECK-NEXT: eor w8, w11, w8 +; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) + %r = xor i32 %r1, %r2 + ret i32 %r +} + +define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umin_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: 
umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uminv s2, v2.4s +; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umax_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umaxv s2, v2.4s +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, hi +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: smin_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: sminv s2, v2.4s +; CHECK-NEXT: sminv s0, v0.4s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, lt +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: smax_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smaxv s2, v2.4s +; CHECK-NEXT: smaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, gt +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>) +declare i32 
@llvm.vector.reduce.smin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>) +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll new file mode 100644 index 0000000..c79c87b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -0,0 +1,289 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64-eabi -mattr=+sve2 < %s | FileCheck %s + +define float @add_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) { +; CHECK-LABEL: add_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: faddv s2, p0, z2.s +; CHECK-NEXT: faddv s0, p0, z0.s +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.nxv8f32(float -0.0, <vscale x 8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.nxv4f32(float -0.0, <vscale x 4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +;define float @fmul_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) { +; %r1 = call fast float @llvm.vector.reduce.fmul.f32.nxv8f32(float 1.0, <vscale x 8 x float> %a) +; %r2 = call fast float @llvm.vector.reduce.fmul.f32.nxv4f32(float 1.0, <vscale x 4 x float> %b) +; %r = fmul fast float %r1, %r2 +; ret float %r +;} + +define float @fmin_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) { +; CHECK-LABEL: fmin_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fminnmv s2, p0, z2.s +; CHECK-NEXT: fminnmv s0, p0, z0.s +; CHECK-NEXT: fminnm s0, s0, s2 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %b) + %r = call float @llvm.minnum.f32(float %r1, float %r2) + ret float %r +} + +define float @fmax_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) { +; CHECK-LABEL: fmax_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmaxnmv s2, p0, z2.s +; CHECK-NEXT: fmaxnmv s0, p0, z0.s +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %b) + %r = call float @llvm.maxnum.f32(float %r1, float %r2) + ret float %r +} + + +define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: uaddv d2, p0, z2.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = add i32 %r1, %r2 + ret i32 %r +} + +define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: add_ext_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z2.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z3.h, z1.b +; 
CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: uaddv d1, p0, z1.h +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16> + %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: add_ext_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: uunpklo z4.h, z0.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: uunpkhi z5.h, z2.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: add z1.h, z4.h, z3.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: add z1.h, z2.h, z5.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: uaddv d1, p0, z1.h +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16> + %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.nxv32i16(<vscale x 32 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +;define i32 @mul_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; %r1 = call i32 @llvm.vector.reduce.mul.i32.nxv8i32(<vscale x 8 x i32> %a) +; %r2 = call i32 @llvm.vector.reduce.mul.i32.nxv4i32(<vscale x 4 x i32> %b) +; %r = mul i32 %r1, %r2 +; ret i32 %r +;} + +define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: and_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: andv s2, p0, z2.s +; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.and.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.and.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = and i32 %r1, %r2 + ret i32 %r +} + +define i32 @or_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: or_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orv s2, p0, z2.s +; CHECK-NEXT: orv s0, p0, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.or.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.or.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = or i32 %r1, %r2 + ret i32 %r +} + +define i32 @xor_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: xor_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: eorv s2, p0, z2.s +; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.xor.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.xor.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = xor i32 %r1, %r2 + ret i32 %r +} + +define i32 @umin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { 
+; CHECK-LABEL: umin_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uminv s2, p0, z2.s +; CHECK-NEXT: uminv s0, p0, z0.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, lo +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umin.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umin.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @umax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: umax_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: umaxv s2, p0, z2.s +; CHECK-NEXT: umaxv s0, p0, z0.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, hi +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umax.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umax.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smin_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sminv s2, p0, z2.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, lt +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smin.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smin.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smax_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: smaxv s2, p0, z2.s +; CHECK-NEXT: smaxv s0, p0, z0.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, gt +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smax.i32.nxv8i32(<vscale x 8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smax.i32.nxv4i32(<vscale x 4 x i32> %b) + %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +declare float @llvm.vector.reduce.fadd.f32.nxv8f32(float, <vscale x 8 x float>) +declare float @llvm.vector.reduce.fadd.f32.nxv4f32(float, <vscale x 4 x float>) +declare float @llvm.vector.reduce.fmul.f32.nxv8f32(float, <vscale x 8 x float>) +declare float @llvm.vector.reduce.fmul.f32.nxv4f32(float, <vscale x 4 x float>) +declare float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float>) +declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>) +declare float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float>) +declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>) +declare i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32>) +declare i16 @llvm.vector.reduce.add.i16.nxv32i16(<vscale x 32 x i16>) +declare i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16>) +declare i32 @llvm.vector.reduce.mul.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.mul.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.and.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.and.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 
@llvm.vector.reduce.or.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.or.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.nxv8i32(<vscale x 8 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.nxv4i32(<vscale x 4 x i32>) +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll index 6c41f62..0106dc2 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll @@ -354,6 +354,21 @@ define double @fminv_nxv2f64(<vscale x 2 x double> %a) { ret double %res } +define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z1.s, z1.s, z2.s +; CHECK-NEXT: faddv s0, p0, z0.s +; CHECK-NEXT: faddv s1, p0, z1.s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.0, <vscale x 4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.0, <vscale x 8 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>) declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>) declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>) @@ -362,6 +377,7 @@ declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>) declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>) declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>) declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>) +declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>) declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>) declare half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 82f45e5..8316c88 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -1992,6 +1992,91 @@ entry: ret i64 %z } +define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) { +; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: saddlp v2.4s, v2.8h +; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: sadalp v2.4s, v3.8h +; 
CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v5.8b, #1 +; CHECK-DOT-NEXT: movi v6.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b +; CHECK-DOT-NEXT: udot v6.2s, v1.8b, v5.8b +; CHECK-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b +; CHECK-DOT-NEXT: udot v6.2s, v0.8b, v5.8b +; CHECK-DOT-NEXT: add v0.2s, v6.2s, v4.2s +; CHECK-DOT-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %axx = zext <8 x i8> %ax to <8 x i32> + %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx) + %ayy = zext <8 x i8> %ay to <8 x i32> + %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy) + %az = add i32 %az1, %az2 + %bxx = sext <8 x i8> %bx to <8 x i32> + %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx) + %byy = sext <8 x i8> %by to <8 x i32> + %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy) + %bz = add i32 %bz1, %bz2 + %z = add i32 %az, %bz + ret i32 %z +} + +define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) { +; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-NEXT: ushll v6.4s, v1.4h, #0 +; CHECK-NEXT: uaddw2 v0.4s, v5.4s, v0.8h +; CHECK-NEXT: ushll v5.4s, v3.4h, #0 +; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v1.8h +; CHECK-NEXT: uaddw2 v2.4s, v4.4s, v2.8h +; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v3.8h +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %axx = zext <8 x i16> %ax to <8 x i32> + %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %axs = add <4 x i32> %s1h, %s1l + %ayy = zext <8 x i16> %ay to <8 x i32> + %s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %ays = add <4 x i32> %s2h, %s2l + %az = add <4 x i32> %axs, %ays + %bxx = zext <8 x i16> %bx to <8 x i32> + %s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %bxs = add <4 x i32> %s3h, %s3l + %byy = zext <8 x i16> %by to <8 x i32> + %s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %bys = add <4 x i32> %s4h, %s4l + %bz = add <4 x i32> %bxs, %bys + %z = add <4 x i32> %az, %bz + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z) + ret i32 %z2 +} + define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: add_pair_v2i64_v2i64: ; CHECK: // %bb.0: // %entry @@ -2006,6 +2091,200 @@ entry: ret i64 %z } +define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { +; CHECK-BASE-LABEL: full: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-BASE-NEXT: // 
kill: def $w1 killed $w1 def $x1 +; CHECK-BASE-NEXT: sxtw x8, w1 +; CHECK-BASE-NEXT: sxtw x9, w3 +; CHECK-BASE-NEXT: add x10, x0, x8 +; CHECK-BASE-NEXT: add x11, x2, x9 +; CHECK-BASE-NEXT: ldr d2, [x0] +; CHECK-BASE-NEXT: ldr d3, [x2] +; CHECK-BASE-NEXT: ldr d0, [x10] +; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: ldr d1, [x11] +; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: ldr d1, [x10, x8] +; CHECK-BASE-NEXT: uabdl v2.8h, v2.8b, v3.8b +; CHECK-BASE-NEXT: ldr d3, [x11, x9] +; CHECK-BASE-NEXT: uadalp v0.4s, v2.8h +; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v3.8b +; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: full: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-DOT-NEXT: sxtw x8, w3 +; CHECK-DOT-NEXT: sxtw x9, w1 +; CHECK-DOT-NEXT: ldr d0, [x0] +; CHECK-DOT-NEXT: add x10, x0, x9 +; CHECK-DOT-NEXT: ldr d1, [x2] +; CHECK-DOT-NEXT: add x11, x2, x8 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v3.8b, #1 +; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, [x11] +; CHECK-DOT-NEXT: add x10, x10, x9 +; CHECK-DOT-NEXT: add x11, x11, x8 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, [x11] +; CHECK-DOT-NEXT: add x10, x10, x9 +; CHECK-DOT-NEXT: add x11, x11, x8 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, [x11] +; CHECK-DOT-NEXT: add x10, x10, x9 +; CHECK-DOT-NEXT: add x11, x11, x8 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, [x11] +; CHECK-DOT-NEXT: add x10, x10, x9 +; CHECK-DOT-NEXT: add x11, x11, x8 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, [x11] +; CHECK-DOT-NEXT: add x10, x10, x9 +; CHECK-DOT-NEXT: add x11, x11, x8 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d1, [x10] +; CHECK-DOT-NEXT: ldr d4, 
[x11] +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: ldr d0, [x10, x9] +; CHECK-DOT-NEXT: uabd v1.8b, v1.8b, v4.8b +; CHECK-DOT-NEXT: ldr d4, [x11, x8] +; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v3.8b +; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v4.8b +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b +; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %idx.ext8 = sext i32 %s2 to i64 + %idx.ext = sext i32 %s1 to i64 + %0 = load <8 x i8>, ptr %p1, align 1 + %1 = zext <8 x i8> %0 to <8 x i32> + %2 = load <8 x i8>, ptr %p2, align 1 + %3 = zext <8 x i8> %2 to <8 x i32> + %4 = sub nsw <8 x i32> %1, %3 + %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true) + %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) + %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext + %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8 + %7 = load <8 x i8>, ptr %add.ptr, align 1 + %8 = zext <8 x i8> %7 to <8 x i32> + %9 = load <8 x i8>, ptr %add.ptr9, align 1 + %10 = zext <8 x i8> %9 to <8 x i32> + %11 = sub nsw <8 x i32> %8, %10 + %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true) + %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12) + %op.rdx.1 = add i32 %13, %6 + %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext + %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8 + %14 = load <8 x i8>, ptr %add.ptr.1, align 1 + %15 = zext <8 x i8> %14 to <8 x i32> + %16 = load <8 x i8>, ptr %add.ptr9.1, align 1 + %17 = zext <8 x i8> %16 to <8 x i32> + %18 = sub nsw <8 x i32> %15, %17 + %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true) + %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19) + %op.rdx.2 = add i32 %20, %op.rdx.1 + %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext + %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8 + %21 = load <8 x i8>, ptr %add.ptr.2, align 1 + %22 = zext <8 x i8> %21 to <8 x i32> + %23 = load <8 x i8>, ptr %add.ptr9.2, align 1 + %24 = zext <8 x i8> %23 to <8 x i32> + %25 = sub nsw <8 x i32> %22, %24 + %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true) + %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26) + %op.rdx.3 = add i32 %27, %op.rdx.2 + %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext + %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8 + %28 = load <8 x i8>, ptr %add.ptr.3, align 1 + %29 = zext <8 x i8> %28 to <8 x i32> + %30 = load <8 x i8>, ptr %add.ptr9.3, align 1 + %31 = zext <8 x i8> %30 to <8 x i32> + %32 = sub nsw <8 x i32> %29, %31 + %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true) + %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33) + %op.rdx.4 = add i32 %34, %op.rdx.3 + %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext + %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8 + %35 = load <8 x i8>, ptr %add.ptr.4, align 1 + %36 = zext <8 x i8> %35 to <8 x i32> + %37 = load <8 x i8>, ptr %add.ptr9.4, align 1 + %38 = zext <8 x i8> %37 to <8 x i32> + %39 = sub nsw <8 x i32> %36, %38 + %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true) + %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40) + %op.rdx.5 = add i32 %41, %op.rdx.4 + %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext + %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8 + %42 = load <8 x i8>, ptr 
%add.ptr.5, align 1 + %43 = zext <8 x i8> %42 to <8 x i32> + %44 = load <8 x i8>, ptr %add.ptr9.5, align 1 + %45 = zext <8 x i8> %44 to <8 x i32> + %46 = sub nsw <8 x i32> %43, %45 + %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true) + %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47) + %op.rdx.6 = add i32 %48, %op.rdx.5 + %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext + %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8 + %49 = load <8 x i8>, ptr %add.ptr.6, align 1 + %50 = zext <8 x i8> %49 to <8 x i32> + %51 = load <8 x i8>, ptr %add.ptr9.6, align 1 + %52 = zext <8 x i8> %51 to <8 x i32> + %53 = sub nsw <8 x i32> %50, %52 + %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true) + %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54) + %op.rdx.7 = add i32 %55, %op.rdx.6 + ret i32 %op.rdx.7 +} + +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 83689f2..452fc36 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -434,6 +434,198 @@ exit: ret half %red.next } + +define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { +; FULLFP16-LABEL: fadd_reduct_reassoc_v8f16: +; FULLFP16: // %bb.0: +; FULLFP16-NEXT: faddp v2.8h, v0.8h, v0.8h +; FULLFP16-NEXT: faddp v3.8h, v1.8h, v1.8h +; FULLFP16-NEXT: faddp v0.8h, v2.8h, v0.8h +; FULLFP16-NEXT: faddp v1.8h, v3.8h, v1.8h +; FULLFP16-NEXT: faddp h0, v0.2h +; FULLFP16-NEXT: faddp h1, v1.2h +; FULLFP16-NEXT: fadd h0, h0, h1 +; FULLFP16-NEXT: ret +; +; CHECKNOFP16-LABEL: fadd_reduct_reassoc_v8f16: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: mov h2, v0.h[1] +; CHECKNOFP16-NEXT: mov h3, v1.h[1] +; CHECKNOFP16-NEXT: fcvt s4, h0 +; CHECKNOFP16-NEXT: fcvt s5, h1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s4, s2 +; CHECKNOFP16-NEXT: fadd s3, s5, s3 +; CHECKNOFP16-NEXT: mov h4, v0.h[2] +; CHECKNOFP16-NEXT: mov h5, v1.h[2] +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s2, s4 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: mov h4, v0.h[3] +; CHECKNOFP16-NEXT: mov h5, v1.h[3] +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s2, s4 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: mov h4, v0.h[4] +; CHECKNOFP16-NEXT: mov h5, v1.h[4] +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s2, s4 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: mov h4, v0.h[5] +; CHECKNOFP16-NEXT: mov h5, v1.h[5] +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s2, 
s4 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: mov h4, v0.h[6] +; CHECKNOFP16-NEXT: mov h5, v1.h[6] +; CHECKNOFP16-NEXT: mov h0, v0.h[7] +; CHECKNOFP16-NEXT: mov h1, v1.h[7] +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s2, s2, s4 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s0, s2, s0 +; CHECKNOFP16-NEXT: fadd s1, s3, s1 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret + %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a) + %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b) + %r = fadd fast half %r1, %r2 + ret half %r +} + +define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: faddp v1.4s, v2.4s, v2.4s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s +; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: faddp s1, v2.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v1.4s, v1.4s, v2.4s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b) + %r = fadd fast float 
%r1, %r2 + ret float %r +} + +define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: faddp d1, v2.2d +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: fadd d0, d0, d1 +; CHECK-NEXT: ret + %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a) + %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b) + %r = fadd fast double %r1, %r2 + ret double %r +} + +define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: fadd s1, s0, s1 +; CHECK-NEXT: fmul s0, s1, s0 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + %p = fmul float %r, %r1 + ret float %p +} + ; Function Attrs: nounwind readnone declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll new file mode 100644 index 0000000..bd910f1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/double_reduct.ll @@ -0,0 +1,369 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define float @add_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: add_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s ft0, v8 +; CHECK-NEXT: vfredusum.vs v8, v9, v10 +; CHECK-NEXT: vfmv.f.s ft1, v8 +; CHECK-NEXT: fadd.s fa0, ft0, ft1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fmul_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmul_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmul.vv v8, v8, v10 +; CHECK-NEXT: vrgather.vi v10, v8, 1 +; CHECK-NEXT: vfmul.vv v8, v8, v10 +; CHECK-NEXT: vfmv.f.s ft0, v8 +; CHECK-NEXT: vslidedown.vi v8, v9, 2 +; CHECK-NEXT: vfmul.vv v8, v9, v8 +; CHECK-NEXT: vrgather.vi v9, v8, 1 +; CHECK-NEXT: vfmul.vv v8, v8, v9 +; CHECK-NEXT: vfmv.f.s ft1, v8 +; CHECK-NEXT: fmul.s fa0, ft0, ft1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) + %r = fmul fast float %r1, %r2 + ret float %r +} + +define float @fmin_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmin_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui 
a0, %hi(.LCPI2_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s ft0, v8 +; CHECK-NEXT: vfredmin.vs v8, v9, v10 +; CHECK-NEXT: vfmv.f.s ft1, v8 +; CHECK-NEXT: fmin.s fa0, ft0, ft1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) + %r = call float @llvm.minnum.f32(float %r1, float %r2) + ret float %r +} + +define float @fmax_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmax_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI3_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s ft0, v8 +; CHECK-NEXT: vfredmax.vs v8, v9, v10 +; CHECK-NEXT: vfmv.f.s ft1, v8 +; CHECK-NEXT: fmax.s fa0, ft0, ft1 +; CHECK-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) + %r = call float @llvm.maxnum.f32(float %r1, float %r2) + ret float %r +} + + +define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) { +; RV32-LABEL: add_i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: vredsum.vs v8, v8, v10 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vredsum.vs v8, v9, v10 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: add_i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vredsum.vs v8, v8, v10 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vredsum.vs v8, v9, v10 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: addw a0, a0, a1 +; RV64-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b) + %r = add i32 %r1, %r2 + ret i32 %r +} + +define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: add_ext_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vwredsumu.vs v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %ae = zext <16 x i8> %a to <16 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: add_ext_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v11, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vwredsumu.vs v10, v10, v11 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma +; CHECK-NEXT: vmv.x.s a0, v10 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + 
%ae = zext <32 x i8> %a to <32 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) { +; RV32-LABEL: mul_i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vrgather.vi v10, v8, 1 +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v8, v9, 2 +; RV32-NEXT: vmul.vv v8, v9, v8 +; RV32-NEXT: vrgather.vi v9, v8, 1 +; RV32-NEXT: vmul.vv v8, v8, v9 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: mul_i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: vrgather.vi v10, v8, 1 +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v8, v9, 2 +; RV64-NEXT: vmul.vv v8, v9, v8 +; RV64-NEXT: vrgather.vi v9, v8, 1 +; RV64-NEXT: vmul.vv v8, v8, v9 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: mulw a0, a0, a1 +; RV64-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) + %r = mul i32 %r1, %r2 + ret i32 %r +} + +define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: and_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: vredand.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredand.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) + %r = and i32 %r1, %r2 + ret i32 %r +} + +define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: or_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredor.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredor.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) + %r = or i32 %r1, %r2 + ret i32 %r +} + +define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: xor_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredxor.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredxor.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) + %r = xor i32 %r1, %r2 + ret i32 %r +} + +define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umin_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: vredminu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredminu.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a) + %r2 = 
call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umax_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredmaxu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredmaxu.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: bltu a1, a0, .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) { +; RV32-LABEL: smin_i32: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, 524288 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vredmin.vs v8, v8, v10 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vredmin.vs v8, v9, v10 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: blt a0, a1, .LBB13_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB13_2: +; RV32-NEXT: ret +; +; RV64-LABEL: smin_i32: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 524288 +; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vredmin.vs v8, v8, v10 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vredmin.vs v8, v9, v10 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: blt a0, a1, .LBB13_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB13_2: +; RV64-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: smax_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vredmax.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vredmax.vs v8, v9, v10 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: blt a1, a0, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>) +declare float @llvm.minnum.f32(float, float) +declare float 
@llvm.maxnum.f32(float, float) +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll new file mode 100644 index 0000000..975f7b4 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll @@ -0,0 +1,329 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -float-abi=hard -verify-machineinstrs %s -o - | FileCheck %s + +define float @add_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: add_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 s4, s10, s11 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s6, s8, s9 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s2, s6, s4 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: bx lr + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + +define float @fmul_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmul_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmul.f32 q0, q0, q1 +; CHECK-NEXT: vmul.f32 s4, s10, s11 +; CHECK-NEXT: vmul.f32 s2, s2, s3 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s6, s8, s9 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: vmul.f32 s2, s6, s4 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: bx lr + %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) + %r = fmul fast float %r1, %r2 + ret float %r +} + +define float @fmin_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmin_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 q0, q0, q1 +; CHECK-NEXT: vminnm.f32 s4, s8, s9 +; CHECK-NEXT: vminnm.f32 s2, s2, s3 +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s2, s10, s11 +; CHECK-NEXT: vminnm.f32 s2, s4, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr + %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) + %r = call float @llvm.minnum.f32(float %r1, float %r2) + ret float %r +} + +define float @fmax_f32(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmax_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 +; CHECK-NEXT: vmaxnm.f32 s4, s8, s9 +; CHECK-NEXT: vmaxnm.f32 s2, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s2, s10, s11 +; CHECK-NEXT: vmaxnm.f32 s2, s4, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr + %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) + %r = call float @llvm.maxnum.f32(float %r1, float %r2) + ret float %r +} + + +define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: add_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vaddv.u32 r0, q1 +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vaddva.u32 r0, q2 +; CHECK-NEXT: bx lr + %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b) + %r = add i32 %r1, %r2 + ret i32 %r +} + 
+define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: add_ext_i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vaddv.u8 r0, q1 +; CHECK-NEXT: vaddva.u8 r0, q0 +; CHECK-NEXT: bx lr + %ae = zext <16 x i8> %a to <16 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: add_ext_v32i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: vldrb.u16 q1, [r2] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vaddv.u16 r0, q1 +; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vldrb.u16 q0, [r1, #8] +; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vldrb.u16 q0, [r2, #8] +; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vaddva.u8 r0, q2 +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: bx lr + %ae = zext <32 x i8> %a to <32 x i16> + %be = zext <16 x i8> %b to <16 x i16> + %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae) + %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be) + %r = add i16 %r1, %r2 + ret i16 %r +} + +define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: mul_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r6, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: mul r2, r12, lr +; CHECK-NEXT: muls r3, r6, r3 +; CHECK-NEXT: mul r1, r4, r5 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: muls r0, r2, r0 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) + %r = mul i32 %r1, %r2 + ret i32 %r +} + +define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: and_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: ands r1, r6 +; CHECK-NEXT: ands r2, r3 +; CHECK-NEXT: and.w r0, r12, lr +; CHECK-NEXT: ands r0, r2 +; CHECK-NEXT: and.w r2, r4, r5 +; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) + %r = and i32 %r1, %r2 + ret i32 %r +} + +define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: or_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: orrs r1, r6 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: orr.w r0, r12, lr +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: orr.w r2, r4, r5 +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) + %r = or i32 %r1, %r2 + ret i32 %r 
+} + +define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: xor_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: eors r1, r6 +; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r0, r12, lr +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eor.w r2, r4, r5 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) + %r = xor i32 %r1, %r2 + ret i32 %r +} + +define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umin_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: vmin.u32 q0, q0, q1 +; CHECK-NEXT: mov.w r1, #-1 +; CHECK-NEXT: vminv.u32 r0, q2 +; CHECK-NEXT: vminv.u32 r1, q0 +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: csel r0, r1, r0, lo +; CHECK-NEXT: bx lr + %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: umax_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmax.u32 q0, q0, q1 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmaxv.u32 r0, q2 +; CHECK-NEXT: vmaxv.u32 r1, q0 +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: csel r0, r1, r0, hi +; CHECK-NEXT: bx lr + %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: smin_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: vmin.s32 q0, q0, q1 +; CHECK-NEXT: mvn r1, #-2147483648 +; CHECK-NEXT: vminv.s32 r0, q2 +; CHECK-NEXT: vminv.s32 r1, q0 +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: csel r0, r1, r0, lt +; CHECK-NEXT: bx lr + %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: smax_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: vmax.s32 q0, q0, q1 +; CHECK-NEXT: mov.w r1, #-2147483648 +; CHECK-NEXT: vmaxv.s32 r0, q2 +; CHECK-NEXT: vmaxv.s32 r1, q0 +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: csel r0, r1, r0, gt +; CHECK-NEXT: bx lr + %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i16 
@llvm.vector.reduce.add.i16.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>) +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32)
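Note: the CHECK lines in each of these new tests are autogenerated (see the NOTE header and RUN line at the top of each file), so when codegen changes they should be regenerated rather than edited by hand. A minimal sketch of the regeneration step, assuming a locally built llc at the hypothetical path build/bin/llc:

    llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AArch64/double_reduct.ll \
        llvm/test/CodeGen/RISCV/double_reduct.ll \
        llvm/test/CodeGen/Thumb2/mve-doublereduct.ll

The script re-runs each file's RUN line and rewrites the CHECK/CHECK-NEXT blocks to match the current llc output, which keeps the assertions in these double-reduction tests consistent across the ARM, AArch64, and RISCV targets touched by this commit.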