author    Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-06-06 09:54:41 +0100
committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-06-06 09:54:41 +0100
commit    b327cbe8f4eefc91ee2bea49a1da7128adf30281
tree      02d04fdafaa0319b916927bdc61e7ce47e3e8f01 /gcc
parent    84eec2916fa68cd2e2b3a2cf764f2ba595cce843
aarch64: Improve representation of ADDLV instructions
We've received requests to optimise the attached intrinsics testcase.
We currently generate:
foo_1:
uaddlp v0.4s, v0.8h
uaddlv d31, v0.4s
fmov x0, d31
ret
foo_2:
uaddlp v0.4s, v0.8h
addv s31, v0.4s
fmov w0, s31
ret
foo_3:
saddlp v0.4s, v0.8h
addv s31, v0.4s
fmov w0, s31
ret
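For reference, the intrinsic sequences behind foo_1, foo_2 and foo_3 above are the bodies of the new gcc.target/aarch64/simd/addlv_1.c test added by this patch:

#include <arm_neon.h>

uint64_t
foo_1 (uint16x8_t b)
{
  return vaddlvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
}

uint32_t
foo_2 (uint16x8_t b)
{
  return vaddvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
}

int32_t
foo_3 (int16x8_t b)
{
  return vaddvq_s32 (vpadalq_s16 (vdupq_n_s32 (0), b));
}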
The widening pairwise-addition ADDLP instructions can be omitted if we're just doing an ADDV afterwards.
Making this optimisation would be quite simple if we had a standard RTL code for a PLUS vector reduction.
As we don't, we can use UNSPEC_ADDV as a stand-in.
This patch expresses the SADDLV and UADDLV instructions as an UNSPEC_ADDV over a widened input, thus removing
the need for separate UNSPEC_SADDLV and UNSPEC_UADDLV codes.
To optimise the testcases involved, we add two splitters that match a vector addition where all participating elements
are taken and widened from the same vector and then fed into an UNSPEC_ADDV. In that case we can drop the
vector PLUS and emit the simple RTL for SADDLV/UADDLV.
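The underlying equivalence is just associativity of the widening addition: adding up the pairwise widened sums gives the same value as widening and summing every element directly, which is what a single SADDLV/UADDLV computes. A minimal scalar sketch of that reasoning, purely for illustration (not part of the patch):

#include <stdint.h>

/* Scalar model of UADDLP followed by a reduction: widen and add each
   pair of 16-bit lanes, then sum the pairwise results.  */
static uint32_t
pairwise_then_reduce (const uint16_t v[8])
{
  uint32_t sum = 0;
  for (int i = 0; i < 8; i += 2)
    sum += (uint32_t) v[i] + (uint32_t) v[i + 1];
  return sum;
}

/* Scalar model of a single UADDLV: widen every lane and sum directly.
   By associativity this yields the same value, so the pairwise step is
   redundant.  */
static uint32_t
reduce_widened (const uint16_t v[8])
{
  uint32_t sum = 0;
  for (int i = 0; i < 8; i++)
    sum += (uint32_t) v[i];
  return sum;
}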
Bootstrapped and tested on aarch64-none-linux-gnu.
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (aarch64_parallel_select_half_p):
Define prototype.
(aarch64_pars_overlap_p): Likewise.
* config/aarch64/aarch64-simd.md (aarch64_<su>addlv<mode>):
Express in terms of UNSPEC_ADDV.
(*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>): Likewise.
(*aarch64_<su>addlv<mode>_reduction): Define.
(*aarch64_uaddlv<mode>_reduction_2): Likewise.
* config/aarch64/aarch64.cc (aarch64_parallel_select_half_p): Define.
(aarch64_pars_overlap_p): Likewise.
* config/aarch64/iterators.md (UNSPEC_SADDLV, UNSPEC_UADDLV): Delete.
(VQUADW): New mode attribute.
(VWIDE2X_S): Likewise.
(USADDLV): Delete.
(su): Delete handling of UNSPEC_SADDLV, UNSPEC_UADDLV.
* config/aarch64/predicates.md (vect_par_cnst_select_half): Define.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/addlv_1.c: New test.
Diffstat (limited to 'gcc')
 gcc/config/aarch64/aarch64-protos.h             |  2
 gcc/config/aarch64/aarch64-simd.md              | 69
 gcc/config/aarch64/aarch64.cc                   | 44
 gcc/config/aarch64/iterators.md                 | 12
 gcc/config/aarch64/predicates.md                |  9
 gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c | 43
6 files changed, 168 insertions, 11 deletions
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index a0642df..a20a20c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -819,6 +819,8 @@ bool aarch64_regno_ok_for_index_p (int, bool);
 bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
                                             bool high);
+bool aarch64_parallel_select_half_p (machine_mode, rtx);
+bool aarch64_pars_overlap_p (rtx, rtx);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a567f01..3b79e24 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3665,15 +3665,73 @@
   DONE;
 })
 
+;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
+;; sign or zero-extends its elements.
 (define_insn "aarch64_<su>addlv<mode>"
   [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
-       (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
-                   USADDLV))]
+       (unspec:<VWIDE_S>
+         [(ANY_EXTEND:<V2XWIDE>
+            (match_operand:VDQV_L 1 "register_operand" "w"))]
+         UNSPEC_ADDV))]
   "TARGET_SIMD"
   "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
+;; An ADDV over a vector PLUS of elements extracted and widened all from the
+;; same vector is the same as an [SU]ADDLV above, so long as all the elements
+;; of that vector are used.  We can greatly simplify the RTL expression using
+;; this splitter.
+(define_insn_and_split "*aarch64_<su>addlv<mode>_reduction"
+  [(set (match_operand:<VWIDE_S> 0 "register_operand")
+       (unspec:<VWIDE_S>
+         [(plus:<VDBLW>
+           (vec_select:<VDBLW>
+             (ANY_EXTEND:<V2XWIDE>
+               (match_operand:VDQV_L 1 "register_operand"))
+             (match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+           (vec_select:<VDBLW> (ANY_EXTEND:<V2XWIDE> (match_dup 1))
+             (match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half")))]
+         UNSPEC_ADDV))]
+  "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (unspec:<VWIDE_S>
+         [(ANY_EXTEND:<V2XWIDE>
+            (match_dup 1))]
+         UNSPEC_ADDV))]
+  {}
+)
+
+;; Similar to the above but for two-step zero-widening reductions.
+;; We can push the outer zero_extend outside the ADDV unspec and make
+;; use of the implicit high-part zeroing semantics of UADDLV to do it all
+;; in a single instruction.
+(define_insn_and_split "*aarch64_uaddlv<mode>_reduction_2"
+  [(set (match_operand:<VWIDE2X_S> 0 "register_operand" "=w")
+       (unspec:<VWIDE2X_S>
+         [(zero_extend:<VQUADW>
+           (plus:<VDBLW>
+             (vec_select:<VDBLW>
+               (zero_extend:<V2XWIDE>
+                 (match_operand:VDQQH 1 "register_operand" "w"))
+               (match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+             (vec_select:<VDBLW> (zero_extend:<V2XWIDE> (match_dup 1))
+               (match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half"))))]
+         UNSPEC_ADDV))]
+  "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (zero_extend:<VWIDE2X_S>
+         (unspec:<VWIDE_S>
+           [(zero_extend:<V2XWIDE>
+              (match_dup 1))]
+           UNSPEC_ADDV)))]
+  {}
+)
+
 ;; Zero-extending version of the above.  As these intrinsics produce a scalar
 ;; value that may be used by further intrinsics we want to avoid moving the
 ;; result into GP regs to do a zero-extension that ADDLV/ADDLP gives for free.
@@ -3681,9 +3739,10 @@
 (define_insn "*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>"
   [(set (match_operand:GPI 0 "register_operand" "=w")
        (zero_extend:GPI
-         (unspec:<VWIDE_S>
-           [(match_operand:VDQV_L 1 "register_operand" "w")]
-           USADDLV)))]
+         (unspec:<VWIDE_S>
+           [(ANY_EXTEND:<VDQV_L:V2XWIDE>
+              (match_operand:VDQV_L 1 "register_operand" "w"))]
+           UNSPEC_ADDV)))]
   "TARGET_SIMD
    && (GET_MODE_SIZE (<GPI:MODE>mode) > GET_MODE_SIZE (<VWIDE_S>mode))"
   "<su>addl<VDQV_L:vp>\\t%<VDQV_L:Vwstype>0<VDQV_L:Vwsuf>, %1.<VDQV_L:Vtype>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 44935e8..1f1f27e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27717,6 +27717,50 @@ aarch64_adjust_reg_alloc_order ()
     reg_alloc_order[i] = i;
 }
 
+/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
+   of vector mode MODE to select half the elements of that vector.
+   Allow any combination of indices except duplicates (or out of range of
+   the mode units).  */
+
+bool
+aarch64_parallel_select_half_p (machine_mode mode, rtx par)
+{
+  int nunits = XVECLEN (par, 0);
+  if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
+    return false;
+  int mode_nunits = nunits * 2;
+  /* Put all the elements of PAR into a hash_set and use its
+     uniqueness guarantees to check that we don't try to insert the same
+     element twice.  */
+  hash_set<rtx> parset;
+  for (int i = 0; i < nunits; ++i)
+    {
+      rtx elt = XVECEXP (par, 0, i);
+      if (!CONST_INT_P (elt)
+         || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
+         || parset.add (elt))
+       return false;
+    }
+  return true;
+}
+
+/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
+   contain any common elements.  */
+
+bool
+aarch64_pars_overlap_p (rtx par1, rtx par2)
+{
+  int len1 = XVECLEN (par1, 0);
+  int len2 = XVECLEN (par2, 0);
+  hash_set<rtx> parset;
+  for (int i = 0; i < len1; ++i)
+    parset.add (XVECEXP (par1, 0, i));
+  for (int i = 0; i < len2; ++i)
+    if (parset.contains (XVECEXP (par2, 0, i)))
+      return true;
+  return false;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d9c7354..9e1e17b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -622,8 +622,6 @@
     UNSPEC_FMINV        ; Used in aarch64-simd.md.
     UNSPEC_FADDV        ; Used in aarch64-simd.md.
     UNSPEC_ADDV         ; Used in aarch64-simd.md.
-    UNSPEC_SADDLV       ; Used in aarch64-simd.md.
-    UNSPEC_UADDLV       ; Used in aarch64-simd.md.
     UNSPEC_SMAXV        ; Used in aarch64-simd.md.
     UNSPEC_SMINV        ; Used in aarch64-simd.md.
     UNSPEC_UMAXV        ; Used in aarch64-simd.md.
@@ -1482,6 +1480,9 @@
                              (V4HI "V2SI") (V8HI  "V4SI")
                              (V2SI "DI")   (V4SI  "V2DI")])
 
+(define_mode_attr VQUADW [(V8QI "V4SI") (V16QI "V8SI")
+                          (V4HI "V2DI") (V8HI "V4DI")])
+
 ;; Narrowed modes for VDN.
 (define_mode_attr VNARROWD [(V4HI "V8QI") (V2SI "V4HI")
                             (DI   "V2SI")])
@@ -1563,6 +1564,9 @@
                             (V2SI "DI")   (V16QI "HI")
                             (V8HI "SI")   (V4SI "DI")])
 
+(define_mode_attr VWIDE2X_S [(V8QI "SI") (V4HI "DI")
+                             (V16QI "SI") (V8HI "DI")])
+
 ;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s")
                           (V2SI "1d") (V16QI "8h")
@@ -2589,8 +2593,6 @@
 
 (define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV])
 
-(define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV])
-
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
 
 (define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
@@ -3332,8 +3334,6 @@
 ;; "s" for signed operations and "u" for unsigned ones.
 (define_int_attr su [(UNSPEC_SADDV "s")
                      (UNSPEC_UADDV "u")
-                     (UNSPEC_SADDLV "s")
-                     (UNSPEC_UADDLV "u")
                      (UNSPEC_UNPACKSHI "s")
                      (UNSPEC_UNPACKUHI "u")
                      (UNSPEC_UNPACKSLO "s")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3cbc735..d93fd86 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -475,6 +475,15 @@
        && aarch64_stepped_int_parallel_p (op, 2);
 })
 
+;; PARALLEL for a vec_select that selects half the elements in a vector of
+;; MODE.  Allows any combination of elements, as long as there's no
+;; duplicate entries.
+(define_special_predicate "vect_par_cnst_select_half"
+  (match_code "parallel")
+{
+  return aarch64_parallel_select_half_p (mode, op);
+})
+
 (define_predicate "descending_int_parallel"
   (match_code "parallel")
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c
new file mode 100644
index 0000000..21fbdb3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_neon.h>
+
+/*
+** foo_1:
+**     uaddlv  s([0-9]+), v0.8h
+**     fmov    x0, d\1
+**     ret
+*/
+
+uint64_t
+foo_1 (uint16x8_t b)
+{
+  return vaddlvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_2:
+**     uaddlv  s([0-9]+), v0.8h
+**     fmov    w0, s\1
+**     ret
+*/
+
+uint32_t
+foo_2 (uint16x8_t b)
+{
+  return vaddvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_3:
+**     saddlv  s([0-9]+), v0.8h
+**     fmov    w0, s\1
+**     ret
+*/
+
+int32_t
+foo_3 (int16x8_t b)
+{
+  return vaddvq_s32 (vpadalq_s16 (vdupq_n_s32 (0), b));
+}