diff options
author | Andrew Pinski <quic_apinski@quicinc.com> | 2024-11-14 19:03:19 -0800 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2024-11-21 11:32:17 -0300 |
commit | eb5eeb47403e0a91de834868e501b4d62b8d2cb9 (patch) | |
tree | 489697c8afed5d6510ad9e8f2399d0c93772e5b8 | |
parent | 3051f3495cad507afebe2f654d32b51629554e3e (diff) | |
download | glibc-eb5eeb47403e0a91de834868e501b4d62b8d2cb9.zip glibc-eb5eeb47403e0a91de834868e501b4d62b8d2cb9.tar.gz glibc-eb5eeb47403e0a91de834868e501b4d62b8d2cb9.tar.bz2 |
aarch64: Remove non-temporal load/stores from oryon-1's memcpy
The hardware architects have a new recommendation not to use
non-temporal load/stores for memcpy. This patch removes this path.
I found there was no difference in the memcpy speed with/without
non-temporal load/stores either.
Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy_oryon1.S | 40 |
1 files changed, 0 insertions, 40 deletions
```diff
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
index 4efc43d..6cae97d 100644
--- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -160,46 +160,6 @@ L(copy96):
 
 	.p2align 6
 L(copy_long):
-	/* On oryon1 cores, large memcpy's are helped by using ldnp/stnp.
-	   This loop is identical to the one below it but using ldnp/stnp
-	   instructions. For loops that are less than 32768 bytes,
-	   the ldnp/stnp instructions will not help and will cause a slow
-	   down so only use the ldnp/stnp loop for the largest sizes. */
-
-	cmp count, #32768
-	b.lo L(copy_long_without_nontemp)
-	and tmp1, dstin, 15
-	bic dst, dstin, 15
-	ldnp D_l, D_h, [src]
-	sub src, src, tmp1
-	add count, count, tmp1 /* Count is now 16 too large. */
-	ldnp A_l, A_h, [src, 16]
-	stnp D_l, D_h, [dstin]
-	ldnp B_l, B_h, [src, 32]
-	ldnp C_l, C_h, [src, 48]
-	ldnp D_l, D_h, [src, 64]
-	add src, src, #64
-	subs count, count, 128 + 16 /* Test and readjust count. */
-
-L(nontemp_loop64):
-	tbz src, #6, 1f
-1:
-	stnp A_l, A_h, [dst, 16]
-	ldnp A_l, A_h, [src, 16]
-	stnp B_l, B_h, [dst, 32]
-	ldnp B_l, B_h, [src, 32]
-	stnp C_l, C_h, [dst, 48]
-	ldnp C_l, C_h, [src, 48]
-	stnp D_l, D_h, [dst, 64]
-	ldnp D_l, D_h, [src, 64]
-	add src, src, #64
-	add dst, dst, #64
-	subs count, count, 64
-	b.hi L(nontemp_loop64)
-	b L(last64)
-
-L(copy_long_without_nontemp):
-
 	and tmp1, dstin, 15
 	bic dst, dstin, 15
 	ldp D_l, D_h, [src]
```