diff options
author | Kyrylo Tkachov <kyrylo.tkachov@arm.com> | 2023-05-04 09:42:37 +0100 |
---|---|---|
committer | Kyrylo Tkachov <kyrylo.tkachov@arm.com> | 2023-05-04 09:43:17 +0100 |
commit | 93c26deab98fc80b616a1c53c324a88f61036f53 (patch) | |
tree | 0465b097cafe1a4bce8e0fac662bcdbd0828a2ac /gcc | |
parent | d840bc5cab39aa3dd8222d72b2cd40942bf91c93 (diff) | |
download | gcc-93c26deab98fc80b616a1c53c324a88f61036f53.zip gcc-93c26deab98fc80b616a1c53c324a88f61036f53.tar.gz gcc-93c26deab98fc80b616a1c53c324a88f61036f53.tar.bz2 |
aarch64: PR target/99195 annotate simple ternary ops for vec-concat with zero
We're now moving onto various simple ternary instructions, including some lane forms.
These include intrinsics that map down to mla, mls, fma, aba, bsl instructions.
Tests are added for lane 0 and lane 1 as for some of these instructions the lane 0 variants
use separate simpler patterns that need a separate annotation.
Bootstrapped and tested on aarch64-none-linux-gnu.
gcc/ChangeLog:
PR target/99195
* config/aarch64/aarch64-simd.md (aarch64_<su>aba<mode>): Rename to...
(aarch64_<su>aba<mode><vczle><vczbe>): ... This.
(aarch64_mla<mode>): Rename to...
(aarch64_mla<mode><vczle><vczbe>): ... This.
(*aarch64_mla_elt<mode>): Rename to...
(*aarch64_mla_elt<mode><vczle><vczbe>): ... This.
(*aarch64_mla_elt_<vswap_width_name><mode>): Rename to...
(*aarch64_mla_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
(aarch64_mla_n<mode>): Rename to...
(aarch64_mla_n<mode><vczle><vczbe>): ... This.
(aarch64_mls<mode>): Rename to...
(aarch64_mls<mode><vczle><vczbe>): ... This.
(*aarch64_mls_elt<mode>): Rename to...
(*aarch64_mls_elt<mode><vczle><vczbe>): ... This.
(*aarch64_mls_elt_<vswap_width_name><mode>): Rename to...
(*aarch64_mls_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
(aarch64_mls_n<mode>): Rename to...
(aarch64_mls_n<mode><vczle><vczbe>): ... This.
(fma<mode>4): Rename to...
(fma<mode>4<vczle><vczbe>): ... This.
(*aarch64_fma4_elt<mode>): Rename to...
(*aarch64_fma4_elt<mode><vczle><vczbe>): ... This.
(*aarch64_fma4_elt_<vswap_width_name><mode>): Rename to...
(*aarch64_fma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
(*aarch64_fma4_elt_from_dup<mode>): Rename to...
(*aarch64_fma4_elt_from_dup<mode><vczle><vczbe>): ... This.
(fnma<mode>4): Rename to...
(fnma<mode>4<vczle><vczbe>): ... This.
(*aarch64_fnma4_elt<mode>): Rename to...
(*aarch64_fnma4_elt<mode><vczle><vczbe>): ... This.
(*aarch64_fnma4_elt_<vswap_width_name><mode>): Rename to...
(*aarch64_fnma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
(*aarch64_fnma4_elt_from_dup<mode>): Rename to...
(*aarch64_fnma4_elt_from_dup<mode><vczle><vczbe>): ... This.
(aarch64_simd_bsl<mode>_internal): Rename to...
(aarch64_simd_bsl<mode>_internal<vczle><vczbe>): ... This.
(*aarch64_simd_bsl<mode>_alt): Rename to...
(*aarch64_simd_bsl<mode>_alt<vczle><vczbe>): ... This.
gcc/testsuite/ChangeLog:
PR target/99195
* gcc.target/aarch64/simd/pr99195_3.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 38 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c | 68 |
2 files changed, 87 insertions, 19 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 511d1e7..705c4b0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1097,7 +1097,7 @@ } ) -(define_insn "aarch64_<su>aba<mode>" +(define_insn "aarch64_<su>aba<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (minus:VDQ_BHSI (USMAX:VDQ_BHSI @@ -1551,7 +1551,7 @@ ) -(define_insn "aarch64_mla<mode>" +(define_insn "aarch64_mla<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1562,7 +1562,7 @@ [(set_attr "type" "neon_mla_<Vetype><q>")] ) -(define_insn "*aarch64_mla_elt<mode>" +(define_insn "*aarch64_mla_elt<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1580,7 +1580,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_mla_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1598,7 +1598,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mla_n<mode>" +(define_insn "aarch64_mla_n<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1611,7 +1611,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mls<mode>" +(define_insn "aarch64_mls<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0") (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1621,7 +1621,7 @@ [(set_attr "type" "neon_mla_<Vetype><q>")] ) -(define_insn "*aarch64_mls_elt<mode>" +(define_insn "*aarch64_mls_elt<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" "0") @@ -1639,7 +1639,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_mls_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" "0") @@ -1657,7 +1657,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mls_n<mode>" +(define_insn "aarch64_mls_n<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 1 "register_operand" "0") @@ -3077,7 +3077,7 @@ } ) -(define_insn "fma<mode>4" +(define_insn "fma<mode>4<vczle><vczbe>" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") (match_operand:VHSDF 2 "register_operand" "w") @@ -3087,7 +3087,7 @@ [(set_attr "type" "neon_fp_mla_<stype><q>")] ) -(define_insn "*aarch64_fma4_elt<mode>" +(define_insn "*aarch64_fma4_elt<mode><vczle><vczbe>" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (vec_duplicate:VDQF @@ -3104,7 +3104,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQSF 0 "register_operand" "=w") (fma:VDQSF (vec_duplicate:VDQSF @@ -3121,7 +3121,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_from_dup<mode>" +(define_insn "*aarch64_fma4_elt_from_dup<mode><vczle><vczbe>" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (vec_duplicate:VMUL @@ -3149,7 +3149,7 @@ [(set_attr "type" "neon_fp_mla_d_scalar_q")] ) -(define_insn "fnma<mode>4" +(define_insn "fnma<mode>4<vczle><vczbe>" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) @@ -3160,7 +3160,7 @@ [(set_attr "type" "neon_fp_mla_<stype><q>")] ) -(define_insn "*aarch64_fnma4_elt<mode>" +(define_insn "*aarch64_fnma4_elt<mode><vczle><vczbe>" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (neg:VDQF @@ -3178,7 +3178,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQSF 0 "register_operand" "=w") (fma:VDQSF (neg:VDQSF @@ -3196,7 +3196,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fnma4_elt_from_dup<mode>" +(define_insn "*aarch64_fnma4_elt_from_dup<mode><vczle><vczbe>" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (neg:VMUL @@ -3808,7 +3808,7 @@ ;; Some forms of straight-line code may generate the equivalent form ;; in *aarch64_simd_bsl<mode>_alt. -(define_insn "aarch64_simd_bsl<mode>_internal" +(define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I @@ -3832,7 +3832,7 @@ ;; the first. The two are equivalent but since recog doesn't try all ;; permutations of commutative operations, we have to have a separate pattern. -(define_insn "*aarch64_simd_bsl<mode>_alt" +(define_insn "*aarch64_simd_bsl<mode>_alt<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c new file mode 100644 index 0000000..c751924 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c @@ -0,0 +1,68 @@ +/* PR target/99195. */ +/* Check that we take advantage of 64-bit Advanced SIMD operations clearing + the top half of the vector register and no explicit zeroing instructions + are emitted. */ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +#include <arm_neon.h> + +#define TERNARY(OT,IT,OP,S) \ +OT \ +foo_##OP##_##S (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c), zeros); \ +} + +#define FUNC(T,IS,OS,OP,S) TERNARY (T##x##OS##_t, T##x##IS##_t, OP, S) + +#define OPTWO(T,IS,OS,S,OP1,OP2) \ +FUNC (T, IS, OS, OP1, S) \ +FUNC (T, IS, OS, OP2, S) + +#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3) \ +FUNC (T, IS, OS, OP1, S) \ +OPTWO (T, IS, OS, S, OP2, OP3) + +#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4) \ +FUNC (T, IS, OS, OP1, S) \ +OPTHREE (T, IS, OS, S, OP2, OP3, OP4) + +OPTHREE (int8, 8, 16, s8, mla, mls, aba) +OPTHREE (int16, 4, 8, s16, mla, mls, aba) +OPTHREE (int32, 2, 4, s32, mla, mls, aba) + +OPFOUR (uint8, 8, 16, u8, mla, mls, aba, bsl) +OPFOUR (uint16, 4, 8, u16, mla, mls, aba, bsl) +OPFOUR (uint32, 2, 4, u32, mla, mls, aba, bsl) + +OPTHREE (float32, 2, 4, f32, mla, fma, fms) + +#undef FUNC +#define TERNARY_LANE(OT,IT,OP,S) \ +OT \ +foo_##OP##_##S (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c, 0), zeros); \ +} \ +OT \ +foo_##OP##_##S##_lane1 (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c, 1), zeros); \ +} + +#define FUNC(T,IS,OS,OP,S) TERNARY_LANE (T##x##OS##_t, T##x##IS##_t, OP, S) +OPTWO (int16, 4, 8, s16, mla_lane, mls_lane) +OPTWO (int32, 2, 4, s32, mla_lane, mls_lane) + +OPTWO (uint16, 4, 8, u16, mla_lane, mls_lane) +OPTWO (uint32, 2, 4, u32, mla_lane, mls_lane) + +OPTHREE (float32, 2, 4, f32, mla_lane, fma_lane, fms_lane) + +/* { dg-final { scan-assembler-not {\tfmov\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\t} } } */ + |