diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-06 16:20:02 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-23 12:13:55 +0100 |
commit | 484acfa4cfe9385d7b78919ca9eb2047ded8f078 (patch) | |
tree | 9822e5a0e0c4374c6ccebcb2d87367766222d769 /gcc | |
parent | 5b965dc49a6a4293ce85bc3a24ca3f3855469e68 (diff) | |
download | gcc-484acfa4cfe9385d7b78919ca9eb2047ded8f078.zip gcc-484acfa4cfe9385d7b78919ca9eb2047ded8f078.tar.gz gcc-484acfa4cfe9385d7b78919ca9eb2047ded8f078.tar.bz2 |
aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics
Use __builtin_memcpy to copy vector structures instead of building
a new opaque structure one vector at a time in each of the vqtbl[234]
Neon intrinsics in arm_neon.h. This simplifies the header file and
also improves code generation - superfluous move instructions were
emitted for every register extraction/set in this additional
structure.
Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.
gcc/ChangeLog:
2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbl3_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vqtbl3_u8): Likewise.
(vqtbl3_p8): Likewise.
(vqtbl3q_s8): Likewise.
(vqtbl3q_u8): Likewise.
(vqtbl3q_p8): Likewise.
(vqtbl4_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_xi one vector at a time.
(vqtbl4_u8): Likewise.
(vqtbl4_p8): Likewise.
(vqtbl4q_s8): Likewise.
(vqtbl4q_u8): Likewise.
(vqtbl4q_p8): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vector_structure_intrinsics.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 72 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c | 44 |
2 files changed, 62 insertions, 54 deletions
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 1048d7c..31ae86e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -23321,8 +23321,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23331,8 +23330,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23341,8 +23339,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23351,8 +23348,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23361,8 +23357,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23371,8 +23366,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23383,9 +23377,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23394,9 +23386,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23405,9 +23395,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23416,9 +23404,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23427,9 +23413,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23438,9 +23422,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23451,10 +23433,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23463,10 +23442,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23475,10 +23451,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23487,10 +23460,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } @@ -23499,10 +23469,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } @@ -23511,10 +23478,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c new file mode 100644 index 0000000..0b07e9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_TBL(name, rettype, tbltype, idxtype, ts) \ + rettype test_ ## name ## _ ## ts (tbltype a, idxtype b) \ + { \ + return name ## _ ## ts (a, b); \ + } + +TEST_TBL (vqtbl2, int8x8_t, int8x16x2_t, uint8x8_t, s8) +TEST_TBL (vqtbl2, uint8x8_t, uint8x16x2_t, uint8x8_t, u8) +TEST_TBL (vqtbl2, poly8x8_t, poly8x16x2_t, uint8x8_t, p8) + +TEST_TBL (vqtbl2q, int8x16_t, int8x16x2_t, uint8x16_t, s8) +TEST_TBL (vqtbl2q, uint8x16_t, uint8x16x2_t, uint8x16_t, u8) +TEST_TBL (vqtbl2q, poly8x16_t, poly8x16x2_t, uint8x16_t, p8) + +TEST_TBL (vqtbl4, int8x8_t, int8x16x4_t, uint8x8_t, s8) +TEST_TBL (vqtbl4, uint8x8_t, uint8x16x4_t, uint8x8_t, u8) +TEST_TBL (vqtbl4, poly8x8_t, poly8x16x4_t, uint8x8_t, p8) + +TEST_TBL (vqtbl4q, int8x16_t, int8x16x4_t, uint8x16_t, s8) +TEST_TBL (vqtbl4q, uint8x16_t, uint8x16x4_t, uint8x16_t, u8) +TEST_TBL (vqtbl4q, poly8x16_t, poly8x16x4_t, uint8x16_t, p8) + +#define TEST_TBL3(name, rettype, tbltype, idxtype, ts) \ + rettype test_ ## name ## _ ## ts (idxtype a, tbltype b) \ + { \ + return name ## _ ## ts (b, a); \ + } + +TEST_TBL3 (vqtbl3, int8x8_t, int8x16x3_t, uint8x8_t, s8) +TEST_TBL3 (vqtbl3, uint8x8_t, uint8x16x3_t, uint8x8_t, u8) +TEST_TBL3 (vqtbl3, poly8x8_t, poly8x16x3_t, uint8x8_t, p8) + +TEST_TBL3 (vqtbl3q, int8x16_t, int8x16x3_t, uint8x16_t, s8) +TEST_TBL3 (vqtbl3q, uint8x16_t, uint8x16x3_t, uint8x16_t, u8) +TEST_TBL3 (vqtbl3q, poly8x16_t, poly8x16x3_t, uint8x16_t, p8) + +/* { dg-final { scan-assembler-not "mov\\t" } } */ + +/* { dg-final { scan-assembler-times "tbl\\t" 18} } */ |