; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <8 x i32> %v, i32 0
  %e1 = extractelement <8 x i32> %v, i32 1
  %e2 = extractelement <8 x i32> %v, i32 2
  %e3 = extractelement <8 x i32> %v, i32 3
  %e4 = extractelement <8 x i32> %v, i32 4
  %e5 = extractelement <8 x i32> %v, i32 5
  %e6 = extractelement <8 x i32> %v, i32 6
  %e7 = extractelement <8 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %e15 = extractelement <16 x i32> %v, i32 15
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  %add14 = add i32 %add13, %e15
  ret i32 %add14
}
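
;; The tests below sum only a leading prefix of a <16 x i32> load.  The
;; extract/add chain should still be matched as a reduction; note in the
;; checks how VL (and with it the selected LMUL) tracks the prefix length
;; rather than the full vector length.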

define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  ret i32 %add1
}

define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  ret i32 %add3
}

define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  ret i32 %add4
}

define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  ret i32 %add5
}

define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  ret i32 %add7
}

define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  ret i32 %add11
}

define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  ret i32 %add12
}

define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  ret i32 %add13
}

; Check that we can match with the operand order reversed, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e0
  %add1 = add i32 %e2, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

; Negative test - Reduction order isn't compatible with the current
; incremental matching scheme.
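; (The chain below combines %e1 and %e2 first instead of starting from
; element 0, so matching incrementally from the front of the vector fails.)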
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v9, v8, 1
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vslidedown.vi v9, v8, 2
; RV32-NEXT:    vslidedown.vi v8, v8, 3
; RV32-NEXT:    vmv.x.s a2, v9
; RV32-NEXT:    vmv.x.s a3, v8
; RV32-NEXT:    add a1, a1, a2
; RV32-NEXT:    add a0, a0, a3
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vslidedown.vi v9, v8, 1
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    vslidedown.vi v9, v8, 2
; RV64-NEXT:    vslidedown.vi v8, v8, 3
; RV64-NEXT:    vmv.x.s a2, v9
; RV64-NEXT:    vmv.x.s a3, v8
; RV64-NEXT:    add a1, a1, a2
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    addw a0, a0, a1
; RV64-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e2
  %add1 = add i32 %e0, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.

define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %xor0 = xor i32 %e0, %e1
  ret i32 %xor0
}

define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %xor0 = xor i32 %e0, %e1
  %xor1 = xor i32 %xor0, %e2
  %xor2 = xor i32 %xor1, %e3
  %xor3 = xor i32 %xor2, %e4
  ret i32 %xor3
}
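
;; Note on start values: and/or/min/max are idempotent, so in the two
;; element cases below the source vector itself can serve as the scalar
;; start operand (e.g. vredand.vs v8, v8, v8).  For the longer prefixes a
;; neutral element must be materialized instead (e.g. all ones for AND).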

define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %and0 = and i32 %e0, %e1
  ret i32 %and0
}

define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %and0 = and i32 %e0, %e1
  %and1 = and i32 %and0, %e2
  %and2 = and i32 %and1, %e3
  %and3 = and i32 %and2, %e4
  ret i32 %and3
}

define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %or0 = or i32 %e0, %e1
  ret i32 %or0
}

define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %or0 = or i32 %e0, %e1
  %or1 = or i32 %or0, %e2
  %or2 = or i32 %or1, %e3
  %or3 = or i32 %or2, %e4
  ret i32 %or3
}

declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)
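
;; For the five element min/max cases the start value is the neutral
;; element of the operation: INT_MIN (lui a0, 524288) for smax, INT_MAX
;; for smin, zero for umax, and UINT_MAX (-1) for umin.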

define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  ret i32 %smax0
}

define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmax.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
  ret i32 %smax3
}

define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  ret i32 %smin0
}

define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmin.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
  ret i32 %smin3
}

define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  ret i32 %umax0
}

define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredmaxu.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
  ret i32 %umax3
}

define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  ret i32 %umin0
}

define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.i v10, -1
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vredminu.vs v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    li a0, -1
; RV64-NEXT:    vmv.s.x v10, a0
; RV64-NEXT:    vredminu.vs v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
  ret i32 %umin3
}
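
;; The floating point reductions below lower to vfredusum, an unordered
;; sum, so the scalar fadd chain has to permit reassociation; the corner
;; case tests further down probe which fast-math flags are required.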

define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fadd0 = fadd fast float %e0, %e1
  ret float %fadd0
}

define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 2
  %e3 = extractelement <16 x float> %v, i32 3
  %e4 = extractelement <16 x float> %v, i32 4
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  %fadd3 = fadd fast float %fadd2, %e4
  ret float %fadd3
}

;; Corner case tests for fadd associativity

; Negative test, not associative. Would need the strict (ordered)
; reduction opcode.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd float %e0, %e1
  ret float %fadd0
}

; Positive test - minimal set of fast math flags
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd reassoc float %e0, %e1
  ret float %fadd0
}

; Negative test - wrong fast math flag.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd ninf float %e0, %e1
  ret float %fadd0
}

; Negative test - last fadd is not associative
define float @reduce_fadd_4xi32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vfredusum.vs v9, v8, v9
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fadd.s fa0, fa4, fa5
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd float %fadd1, %e3
  ret float %fadd2
}

; Negative test - first fadd is not associative
; We could form a reduce for elements 2 and 3.
define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa3, v9
; CHECK-NEXT:    vfmv.f.s fa2, v8
; CHECK-NEXT:    fadd.s fa5, fa5, fa4
; CHECK-NEXT:    fadd.s fa4, fa3, fa2
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  ret float %fadd2
}

define float @reduce_fmaxnum_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fmaxnum_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfredmax.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fmax0 = call float @llvm.maxnum.f32(float %e0, float %e1)
  ret float %fmax0
}

define float @reduce_fminnum_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fminnum_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfredmin.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fmax0 = call float @llvm.minnum.f32(float %e0, float %e1)
  ret float %fmax0
}