diff options
author | Andrew Pinski <quic_apinski@quicinc.com> | 2024-11-14 19:03:20 -0800 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2024-11-21 11:32:23 -0300 |
commit | e6590f0c86632c36c9a784cf96075f4be2e920d2 (patch) | |
tree | 578f8d557305b23525bf0f95628eb7b6eb2d574f | |
parent | eb5eeb47403e0a91de834868e501b4d62b8d2cb9 (diff) | |
download | glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.zip glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.tar.gz glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.tar.bz2 |
aarch64: Remove non-temporal load/stores from oryon-1's memset
The hardware architects have a new recommendation not to use
non-temporal load/stores for memset. This patch removes this path.
I found there was no difference in the memset speed with/without
non-temporal load/stores either.
Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
-rw-r--r-- | sysdeps/aarch64/multiarch/memset_oryon1.S | 26 |
1 file changed, 0 insertions, 26 deletions
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 6fa28a9..b63c16e 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -93,8 +93,6 @@ L(set_long):
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
 	b.eq	L(try_zva)
-	cmp	count, #32768
-	b.hi	L(set_long_with_nontemp)
 	/* Small-size or non-zero memset does not use DC ZVA. */
 	sub	count, dstend, dst
 
@@ -117,30 +115,6 @@ L(set_long):
 	stp	val, val, [dstend, -16]
 	ret
 
-L(set_long_with_nontemp):
-	/* Small-size or non-zero memset does not use DC ZVA. */
-	sub	count, dstend, dst
-
-	/* Adjust count and bias for loop. By subtracting extra 1 from count,
-	   it is easy to use tbz instruction to check whether loop tailing
-	   count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
-	sub	count, count, 64+16+1
-
-1:	stnp	val, val, [dst, 16]
-	stnp	val, val, [dst, 32]
-	stnp	val, val, [dst, 48]
-	stnp	val, val, [dst, 64]
-	add	dst, dst, #64
-	subs	count, count, 64
-	b.hs	1b
-
-	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
-	stnp	val, val, [dst, 16]
-	stnp	val, val, [dst, 32]
-1:	stnp	val, val, [dstend, -32]
-	stnp	val, val, [dstend, -16]
-	ret
-
 L(try_zva):
 	/* Write the first and last 64 byte aligned block using stp rather
 	   than using DC ZVA as it is faster. */