diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-08 23:27:54 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-23 12:14:42 +0100 |
commit | f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df (patch) | |
tree | bb0665a0de5b6d44b61b3872eeae320c71a156af | |
parent | 5f65676eba16f38e5e22122e6885c0bd8e504276 (diff) | |
download | gcc-f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df.zip gcc-f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df.tar.gz gcc-f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df.tar.bz2 |
aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics
Use __builtin_memcpy to copy vector structures instead of building
a new opaque structure one vector at a time in each of the vtbl[34]
Neon intrinsics in arm_neon.h. This simplifies the header file and
also improves code generation - superfluous move instructions were
emitted for every register extraction/set in this additional
structure.
gcc/ChangeLog:
2021-07-08 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vtbl3_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vtbl3_u8): Likewise.
(vtbl3_p8): Likewise.
(vtbl4_s8): Likewise.
(vtbl4_u8): Likewise.
(vtbl4_p8): Likewise.
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 39 |
1 files changed, 12 insertions, 27 deletions
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a7b8449..0ec46ef 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9682,11 +9682,9 @@ vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx) int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_s8 (__tab.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9697,11 +9695,9 @@ vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_u8 (__tab.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9712,11 +9708,9 @@ vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_p8 (__tab.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9728,10 +9722,7 @@ vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9743,10 +9734,7 @@ vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9758,10 +9746,7 @@ vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return(poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } |