aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Andrew Pinski <quic_apinski@quicinc.com> 2024-11-14 19:03:20 -0800
committer: Adhemerval Zanella <adhemerval.zanella@linaro.org> 2024-11-21 11:32:23 -0300
commit: e6590f0c86632c36c9a784cf96075f4be2e920d2 (patch)
tree: 578f8d557305b23525bf0f95628eb7b6eb2d574f
parent: eb5eeb47403e0a91de834868e501b4d62b8d2cb9 (diff)
download: glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.zip
glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.tar.gz
glibc-e6590f0c86632c36c9a784cf96075f4be2e920d2.tar.bz2
aarch64: Remove non-temporal load/stores from oryon-1's memset
The hardware architects have a new recommendation not to use
non-temporal load/stores for memset. This patch removes this path.
I found there was no difference in the memset speed with/without
non-temporal load/stores either.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
-rw-r--r-- sysdeps/aarch64/multiarch/memset_oryon1.S 26
1 files changed, 0 insertions, 26 deletions
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 6fa28a9..b63c16e 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -93,8 +93,6 @@ L(set_long):
cmp count, 256
ccmp valw, 0, 0, cs
b.eq L(try_zva)
- cmp count, #32768
- b.hi L(set_long_with_nontemp)
/* Small-size or non-zero memset does not use DC ZVA. */
sub count, dstend, dst
@@ -117,30 +115,6 @@ L(set_long):
stp val, val, [dstend, -16]
ret
-L(set_long_with_nontemp):
- /* Small-size or non-zero memset does not use DC ZVA. */
- sub count, dstend, dst
-
- /* Adjust count and bias for loop. By subtracting extra 1 from count,
- it is easy to use tbz instruction to check whether loop tailing
- count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
- sub count, count, 64+16+1
-
-1: stnp val, val, [dst, 16]
- stnp val, val, [dst, 32]
- stnp val, val, [dst, 48]
- stnp val, val, [dst, 64]
- add dst, dst, #64
- subs count, count, 64
- b.hs 1b
-
- tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
- stnp val, val, [dst, 16]
- stnp val, val, [dst, 32]
-1: stnp val, val, [dstend, -32]
- stnp val, val, [dstend, -16]
- ret
-
L(try_zva):
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA as it is faster. */