diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-08 23:27:54 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2021-07-23 12:15:02 +0100 |
commit | 4848e283ccaed451ddcc38edcb9f5ce9e9f2d7eb (patch) | |
tree | 4ae2831d01b01fc7c3e1ec6865dd55da75e2e8e7 /gcc | |
parent | f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df (diff) | |
download | gcc-4848e283ccaed451ddcc38edcb9f5ce9e9f2d7eb.zip gcc-4848e283ccaed451ddcc38edcb9f5ce9e9f2d7eb.tar.gz gcc-4848e283ccaed451ddcc38edcb9f5ce9e9f2d7eb.tar.bz2 |
aarch64: Use memcpy to copy vector tables in vtbx4 intrinsics
Use __builtin_memcpy to copy vector structures instead of building
a new opaque structure one vector at a time in each of the vtbx4
Neon intrinsics in arm_neon.h. This simplifies the header file and
also improves code generation - superfluous move instructions were
emitted for every register extraction/set in this additional
structure.
gcc/ChangeLog:
2021-07-19 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vtbx4_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vtbx4_u8): Likewise.
(vtbx4_p8): Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 15 |
1 files changed, 3 insertions, 12 deletions
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0ec46ef..d383af3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -28417,10 +28417,7 @@ vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
   __builtin_aarch64_simd_oi __o;
   __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
   __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[1], 1);
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
   return __builtin_aarch64_qtbx2v8qi (__r, __o, __idx);
 }
 
@@ -28432,10 +28429,7 @@ vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
   __builtin_aarch64_simd_oi __o;
   __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
   __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[1], 1);
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
   return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o,
                                                  (int8x8_t)__idx);
 }
@@ -28448,10 +28442,7 @@ vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
   __builtin_aarch64_simd_oi __o;
   __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
   __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o,
-                                           (int8x16_t) __temp.val[1], 1);
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
   return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o,
                                                  (int8x8_t)__idx);
 }