author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2023-05-04 15:19:52 +0100
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2023-05-04 15:19:52 +0100
commit     fe3e4557471dfc7e617884186f41e18d6b4023c1
tree       bad71dde7a9dad7cc4f97e1e8b8b9ed656cf6b41
parent     0a26a42b237bada32165e61867a2bf4461c5fab2
[1/2] aarch64: Reimplement (R){ADD,SUB}HN intrinsics with RTL codes
We can implement the halving-narrowing add/sub patterns with standard RTL codes rather than relying on unspecs.
This patch handles the low-part ones; the second patch handles the high-part ones and removes the unspecs themselves.
The operation ADDHN on V4SI, for example, is represented as (truncate:V4HI ((src1:V4SI + src2:V4SI) >> 16))
and RADDHN as (truncate:V4HI ((src1:V4SI + src2:V4SI + (1 << 15)) >> 16)).
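To make the shift amount and rounding bias concrete, here is a minimal scalar sketch (illustrative only, not part of the patch; the function names are made up) of what each lane computes in the V4SI-to-V4HI case described above:

#include <stdint.h>

/* ADDHN: add the wide elements, keep the high half of each sum.  */
static int16_t
addhn_lane (int32_t a, int32_t b)
{
  return (int16_t) (((uint32_t) a + (uint32_t) b) >> 16);
}

/* RADDHN: as above, but add a rounding bias of 1 << 15 before shifting.  */
static int16_t
raddhn_lane (int32_t a, int32_t b)
{
  return (int16_t) (((uint32_t) a + (uint32_t) b + (1u << 15)) >> 16);
}

The unsigned arithmetic mirrors the modulo behaviour of the vector instructions; only the top half of each wide result survives the truncate.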
Taking this opportunity, I've changed the patterns to return the narrow mode directly and annotated them with the
<vczle><vczbe> define_subst rules so that we get the vec_concat-zero meta-patterns too. This also allows us to
simplify the expanders somewhat. Tests are added to check that the combinations work.
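For reference, the vec_concat-with-zero shape that the <vczle><vczbe> meta-patterns catch is the one the new test exercises; written out by hand (function name illustrative), one instantiation looks like:

#include <arm_neon.h>

/* The 64-bit ADDHN result lands in the low half of a 128-bit vector whose
   top half is explicitly zeroed; with the meta-patterns this compiles to a
   single ADDHN with no separate zeroing MOV/FMOV.  */
int16x8_t
addhn_zero_high (int32x4_t a, int32x4_t b)
{
  return vcombine_s16 (vaddhn_s32 (a, b), vcreate_s16 (0));
}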
Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on aarch64_be-none-elf.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>_insn_le):
Delete.
(aarch64_<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>_insn_be): Delete.
(aarch64_r<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>): Delete.
(aarch64_<optab>hn<mode>): New define_expand.
(aarch64_r<optab>hn<mode>): Likewise.
* config/aarch64/predicates.md (aarch64_simd_raddsubhn_imm_vec):
New predicate.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/pr99195_4.c: New test.
 gcc/config/aarch64/aarch64-simd.md                | 82
 gcc/config/aarch64/predicates.md                  |  6
 gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c | 35
 3 files changed, 88 insertions(+), 35 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 705c4b0..421173e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4955,49 +4955,61 @@
 ;; <r><addsub>hn<q>.
 
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
-  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
-	(vec_concat:<VNARROWQ2>
-	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			      (match_operand:VQN 2 "register_operand" "w")]
-			     ADDSUBHN)
-	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_<optab>hn<mode>_insn<vczle><vczbe>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (ashiftrt:VQN
+	    (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+			(match_operand:VQN 2 "register_operand" "w"))
+	    (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<optab>_halve_narrow_q")]
 )
 
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
-  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
-	(vec_concat:<VNARROWQ2>
-	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
-	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			      (match_operand:VQN 2 "register_operand" "w")]
-			     ADDSUBHN)))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
-  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_r<optab>hn<mode>_insn<vczle><vczbe>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (ashiftrt:VQN
+	    (plus:VQN
+	      (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+			  (match_operand:VQN 2 "register_operand" "w"))
+	      (match_operand:VQN 3 "aarch64_simd_raddsubhn_imm_vec"))
+	    (match_operand:VQN 4 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "r<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<optab>_halve_narrow_q")]
 )
 
-(define_expand "aarch64_<sur><addsub>hn<mode>"
+(define_expand "aarch64_<optab>hn<mode>"
   [(set (match_operand:<VNARROWQ> 0 "register_operand")
-	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
-			    (match_operand:VQN 2 "register_operand")]
-			   ADDSUBHN))]
+	(ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+		    (match_operand:VQN 2 "register_operand")))]
   "TARGET_SIMD"
   {
-    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
-    if (BYTES_BIG_ENDIAN)
-      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
-				operands[2], CONST0_RTX (<VNARROWQ>mode)));
-    else
-      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
-				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+    rtx shft
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+				GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+    emit_insn (gen_aarch64_<optab>hn<mode>_insn (operands[0], operands[1],
+						 operands[2], shft));
+    DONE;
+  }
+)
 
-    /* The intrinsic expects a narrow result, so emit a subreg that will get
-       optimized away as appropriate.  */
-    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
-						 <VNARROWQ2>mode));
+(define_expand "aarch64_r<optab>hn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+	(ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+		    (match_operand:VQN 2 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    rtx shft
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+				GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+    rtx rnd
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+		HOST_WIDE_INT_1U << (GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2 - 1));
+    emit_insn (gen_aarch64_r<optab>hn<mode>_insn (operands[0], operands[1],
+						  operands[2], rnd, shft));
     DONE;
   }
 )
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 242f10a..73f7ade 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -593,6 +593,12 @@
 		 HOST_WIDE_INT_1U << (GET_MODE_UNIT_BITSIZE (mode) - 1))")))
 
+(define_predicate "aarch64_simd_raddsubhn_imm_vec"
+  (and (match_code "const_vector")
+       (match_test "aarch64_const_vec_all_same_in_range_p (op, 1,
+				HOST_WIDE_INT_1U
+				<< (GET_MODE_UNIT_BITSIZE (mode) / 2 - 1))")))
+
 (define_predicate "aarch64_simd_shift_imm_bitsize_qi"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 8)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
new file mode 100644
index 0000000..b6ef15b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
@@ -0,0 +1,35 @@
+/* PR target/99195.  */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+   the top half of the vector register and no explicit zeroing instructions
+   are emitted.  */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define MYOP(OT,IT,IMT,OP,IS,OS) \
+OT \
+foo_##OP##_##OS (IT a, IT b) \
+{ \
+  IMT zeros = vcreate_##OS (0); \
+  return vcombine_##OS (v##OP##_##IS (a, b), zeros); \
+}
+
+
+#define FUNC(OT,IT,IMT,IS,OS) \
+MYOP (OT, IT, IMT, addhn, IS, OS) \
+MYOP (OT, IT, IMT, subhn, IS, OS) \
+MYOP (OT, IT, IMT, raddhn, IS, OS) \
+MYOP (OT, IT, IMT, rsubhn, IS, OS)
+
+FUNC (int8x16_t, int16x8_t, int8x8_t, s16, s8)
+FUNC (int16x8_t, int32x4_t, int16x4_t, s32, s16)
+FUNC (int32x4_t, int64x2_t, int32x2_t, s64, s32)
+
+FUNC (uint8x16_t, uint16x8_t, uint8x8_t, u16, u8)
+FUNC (uint16x8_t, uint32x4_t, uint16x4_t, u32, u16)
+FUNC (uint32x4_t, uint64x2_t, uint32x2_t, u64, u32)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */