author     Jonathan Wright <jonathan.wright@arm.com>  2021-07-08 23:27:54 +0100
committer  Jonathan Wright <jonathan.wright@arm.com>  2021-07-23 12:14:42 +0100
commit     f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df (patch)
tree       bb0665a0de5b6d44b61b3872eeae320c71a156af
parent     5f65676eba16f38e5e22122e6885c0bd8e504276 (diff)
aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics
Use __builtin_memcpy to copy vector structures instead of building a new
opaque structure one vector at a time in each of the vtbl[34] Neon
intrinsics in arm_neon.h. This simplifies the header file and also
improves code generation - superfluous move instructions were emitted
for every register extraction/set in this additional structure.

gcc/ChangeLog:

2021-07-08  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/arm_neon.h (vtbl3_s8): Use __builtin_memcpy
	instead of constructing __builtin_aarch64_simd_oi one vector
	at a time.
	(vtbl3_u8): Likewise.
	(vtbl3_p8): Likewise.
	(vtbl4_s8): Likewise.
	(vtbl4_u8): Likewise.
	(vtbl4_p8): Likewise.
-rw-r--r--  gcc/config/aarch64/arm_neon.h  39
1 file changed, 12 insertions(+), 27 deletions(-)
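
The essence of the change, as a minimal stand-alone C sketch: the struct and
function names below are illustrative stand-ins for the GCC-internal
__builtin_aarch64_simd_oi type and the set_qregoiv16qi builtin, not the actual
arm_neon.h internals.

/* Sketch only: ordinary structs standing in for the opaque OI-mode type.  */
#include <string.h>

typedef struct { unsigned char b[16]; } q_reg;      /* stand-in for int8x16_t */
typedef struct { q_reg val[2]; } q_reg_x2;           /* stand-in for int8x16x2_t */
typedef struct { q_reg r[2]; } opaque_pair;          /* stand-in for the opaque pair */

/* Old approach: populate the opaque structure one register at a time,
   which forced a register extraction/set (and extra moves) per element.  */
static opaque_pair
build_by_parts (q_reg_x2 t)
{
  opaque_pair o;
  o.r[0] = t.val[0];
  o.r[1] = t.val[1];
  return o;
}

/* New approach: the two layouts are identical, so a single memcpy of the
   whole aggregate suffices and the compiler can elide the copy entirely.  */
static opaque_pair
build_by_memcpy (q_reg_x2 t)
{
  opaque_pair o;
  memcpy (&o, &t, sizeof (t));
  return o;
}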
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index a7b8449..0ec46ef 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9682,11 +9682,9 @@ vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx)
int8x16x2_t __temp;
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
- __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __temp.val[1] = vcombine_s8 (__tab.val[2],
+ vcreate_s8 (__AARCH64_UINT64_C (0x0)));
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return __builtin_aarch64_qtbl2v8qi (__o, __idx);
}
@@ -9697,11 +9695,9 @@ vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx)
uint8x16x2_t __temp;
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
- __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __temp.val[1] = vcombine_u8 (__tab.val[2],
+ vcreate_u8 (__AARCH64_UINT64_C (0x0)));
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -9712,11 +9708,9 @@ vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx)
poly8x16x2_t __temp;
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
- __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __temp.val[1] = vcombine_p8 (__tab.val[2],
+ vcreate_p8 (__AARCH64_UINT64_C (0x0)));
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -9728,10 +9722,7 @@ vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx)
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
__temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return __builtin_aarch64_qtbl2v8qi (__o, __idx);
}
@@ -9743,10 +9734,7 @@ vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx)
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
__temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
@@ -9758,10 +9746,7 @@ vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx)
__builtin_aarch64_simd_oi __o;
__temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
__temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o,
- (int8x16_t) __temp.val[1], 1);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx);
}
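
For reference, a hedged usage sketch of one of the affected intrinsics (not
part of the patch; it assumes an AArch64 target where arm_neon.h is available):

#include <arm_neon.h>

/* Look up eight bytes from a 24-byte table using vtbl3_u8.  Indices 0..23
   select a table byte; out-of-range indices yield 0 on AArch64.  */
uint8x8_t
lookup24 (uint8x8x3_t table, uint8x8_t idx)
{
  return vtbl3_u8 (table, idx);
}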