author     Andrew Pinski <quic_apinski@quicinc.com>   2025-02-21 15:13:53 -0800
committer  Andrew Pinski <quic_apinski@quicinc.com>   2025-04-15 12:07:07 -0700
commit     ceeffd970c56893885cbf8382ae34b015f177850 (patch)
tree       bf0bf7710378b77a78c4f9292e843af8c6e5336a /sysdeps
parent     0e1aa5db738ac7c73599a3e7f1a0b70b99f99e0a (diff)
aarch64: Add back non-temporal load/stores from oryon-1's memset
I misunderstood the recommendation from the hardware team about non-temporal
load/stores.  It is still recommended to use them in memset for large sizes.
They were not recommended for use with device memory, and memset is already
not valid to be used with device memory.

This reverts commit e6590f0c86632c36c9a784cf96075f4be2e920d2.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
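A rough, hypothetical C sketch of the dispatch this patch restores (the helper
names below are invented stand-ins for the assembly paths; only the thresholds
come from the code in the diff): zero-fills of 256 bytes or more go to the
DC ZVA path, non-zero fills larger than 32768 bytes go to the non-temporal
stnp loop, and everything else goes to the ordinary stp loop.

#include <stddef.h>
#include <stdio.h>

/* Byte-loop stand-ins for the three assembly paths: L(try_zva), the
   ordinary stp loop, and L(set_long_with_nontemp).  */
static void set_with_dc_zva (unsigned char *d, size_t n)
{ for (size_t i = 0; i < n; i++) d[i] = 0; }

static void set_with_stp (unsigned char *d, int v, size_t n)
{ for (size_t i = 0; i < n; i++) d[i] = (unsigned char) v; }

static void set_with_stnp (unsigned char *d, int v, size_t n)
{ for (size_t i = 0; i < n; i++) d[i] = (unsigned char) v; }

/* The size/value dispatch performed by the patched L(set_long).  */
static void memset_large_sketch (unsigned char *dst, int val, size_t count)
{
  if (val == 0 && count >= 256)
    set_with_dc_zva (dst, count);      /* zeroing: DC ZVA block writes */
  else if (count > 32768)
    set_with_stnp (dst, val, count);   /* > 32 KiB: non-temporal stnp stores */
  else
    set_with_stp (dst, val, count);    /* otherwise: ordinary stp stores */
}

int main (void)
{
  static unsigned char buf[40000];
  memset_large_sketch (buf, 0xab, sizeof buf);   /* takes the stnp path */
  printf ("%#x %#x\n", buf[0], buf[sizeof buf - 1]);
  return 0;
}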
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/multiarch/memset_oryon1.S  26
1 file changed, 26 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 0f9b718..88f4ef4 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -90,6 +90,8 @@ L(set_long):
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
 	b.eq	L(try_zva)
+	cmp	count, #32768
+	b.hi	L(set_long_with_nontemp)
 	/* Small-size or non-zero memset does not use DC ZVA. */
 	sub	count, dstend, dst
@@ -112,6 +114,30 @@ L(set_long):
 	stp	val, val, [dstend, -16]
 	ret
+L(set_long_with_nontemp):
+	/* Small-size or non-zero memset does not use DC ZVA. */
+	sub	count, dstend, dst
+
+	/* Adjust count and bias for loop. By subtracting extra 1 from count,
+	   it is easy to use tbz instruction to check whether loop tailing
+	   count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
+	sub	count, count, 64+16+1
+
+1:	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+	stnp	val, val, [dst, 48]
+	stnp	val, val, [dst, 64]
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hs	1b
+
+	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
+	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+1:	stnp	val, val, [dstend, -32]
+	stnp	val, val, [dstend, -16]
+	ret
+
 L(try_zva):
 	/* Write the first and last 64 byte aligned block using stp rather
 	   than using DC ZVA as it is faster. */
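The count bias in L(set_long_with_nontemp) can be checked in isolation.  The
stnp loop stores 64 bytes per iteration starting at offset 16 from dst, and
after the bias of 64+16+1 it exits with count in the range -64..-1; bit 5 of
that residue is set exactly when more than 32 bytes remain between the loop's
last store and dstend, which is what the tbz tests before the two optional
stnp instructions.  The program below is a hypothetical standalone check of
that invariant, not glibc code; it assumes only what the loop above does.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  /* remaining = dstend - dst on entry to L(set_long_with_nontemp);
     this path is only taken for sizes above 32768 bytes.  */
  for (int64_t remaining = 32769; remaining < 32769 + 256; remaining++)
    {
      int64_t count = remaining - (64 + 16 + 1);   /* bias from the patch */
      int64_t dst_off = 0;

      do                                  /* the stnp loop: 64 bytes/iter */
        {
          dst_off += 64;
          count -= 64;
        }
      while (count >= 0);                 /* subs count, count, 64; b.hs 1b */

      /* Bytes from dst+16 (first byte the loop has not stored) to dstend.  */
      int64_t tail = remaining - dst_off - 16;
      int need_extra = tail > 32;         /* are the two extra stnp needed?  */
      int bit5 = (int) (((uint64_t) count >> 5) & 1);  /* what tbz inspects */

      if (need_extra != bit5)
        printf ("mismatch at remaining=%lld\n", (long long) remaining);
    }
  puts ("bit 5 of the biased count flags tails larger than 32 bytes");
  return 0;
}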