Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r--  sysdeps/aarch64/multiarch/memset_oryon1.S | 26
1 file changed, 26 insertions(+), 0 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 0f9b718..88f4ef4 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -90,6 +90,8 @@ L(set_long):
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
 	b.eq	L(try_zva)
+	cmp	count, #32768
+	b.hi	L(set_long_with_nontemp)
 
 	/* Small-size or non-zero memset does not use DC ZVA. */
 	sub	count, dstend, dst
@@ -112,6 +114,30 @@ L(set_long):
 	stp	val, val, [dstend, -16]
 	ret
 
+L(set_long_with_nontemp):
+	/* Small-size or non-zero memset does not use DC ZVA. */
+	sub	count, dstend, dst
+
+	/* Adjust count and bias it for the loop.  Subtracting an extra 1
+	   from count lets a single tbz test whether the loop tail is less
+	   than 33 bytes, bypassing 2 unnecessary stps.  */
+	sub	count, count, 64+16+1
+
+1:	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+	stnp	val, val, [dst, 48]
+	stnp	val, val, [dst, 64]
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hs	1b
+
+	tbz	count, 5, 1f /* Remaining count is less than 33 bytes?  */
+	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+1:	stnp	val, val, [dstend, -32]
+	stnp	val, val, [dstend, -16]
+	ret
+
 L(try_zva):
 	/* Write the first and last 64 byte aligned block using stp rather
 	   than using DC ZVA as it is faster. */
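
The rationale for the new branch: a buffer larger than 32768 bytes is unlikely
to be reused out of the cache, so the added L(set_long_with_nontemp) path fills
it with stnp non-temporal pair stores, which hint to the core not to allocate
the written lines in cache and so avoid evicting useful data. Below is a
minimal C model of the new path's control flow; it is a sketch, not the
implementation: plain 16-byte memcpy calls stand in for stnp, the name
set_long_with_nontemp_model is made up, and it assumes (as the assembly does)
that the first 16 bytes at dst were already written and that the region is
well over 96 bytes.

#include <stddef.h>
#include <string.h>

static void
set_long_with_nontemp_model (unsigned char *dst, unsigned char *dstend,
			     const unsigned char val16[16])
{
  /* Bias count by 64 (loop stride) + 16 (stores start at dst + 16)
     + 1 (makes bit 5 an exact "more than 32 bytes left" test).  */
  ptrdiff_t count = (dstend - dst) - (64 + 16 + 1);

  do
    {
      memcpy (dst + 16, val16, 16);	/* stnp val, val, [dst, 16] */
      memcpy (dst + 32, val16, 16);	/* stnp val, val, [dst, 32] */
      memcpy (dst + 48, val16, 16);	/* stnp val, val, [dst, 48] */
      memcpy (dst + 64, val16, 16);	/* stnp val, val, [dst, 64] */
      dst += 64;
      count -= 64;
    }
  while (count >= 0);			/* subs + b.hs */

  if (count & 32)			/* tbz count, 5, 1f */
    {
      memcpy (dst + 16, val16, 16);
      memcpy (dst + 32, val16, 16);
    }
  /* The last 32 bytes are always written relative to dstend.  */
  memcpy (dstend - 32, val16, 16);
  memcpy (dstend - 16, val16, 16);
}

Calling this with val16 filled with the memset byte mirrors what the assembly
does once count exceeds 32768; writing the tail relative to dstend keeps the
cleanup branchless apart from the single tbz.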
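The 64+16+1 bias is what makes the single-bit test exact: at loop exit the
biased count lies in [-64, -1], i.e. dstend - dst lies in [17, 80], and bit 5
of the negative count is set exactly when more than 32 bytes beyond dst + 16
remain unwritten. A small throwaway check over all possible exit values (a
sketch, assuming two's-complement arithmetic, which is what tbz operates on):

#include <assert.h>

int
main (void)
{
  /* For every residual r = dstend - dst possible at loop exit, bit 5 of
     the biased count must match "tail above dst + 16 exceeds 32 bytes",
     which is what tbz count, 5 tests in the assembly.  */
  for (int r = 17; r <= 80; r++)
    {
      int count = r - (64 + 16 + 1);	/* in [-64, -1] */
      int need_extra = (r - 16) > 32;	/* need the 2 extra stnp?  */
      assert (((count & 32) != 0) == need_extra);
    }
  return 0;
}

Without the extra 1, the r = 49 case (exactly 33 unwritten bytes) would fall
on the wrong side of the bit-5 boundary, so the test could not separate
"at most 32 bytes left" from "33 or more" with one tbz.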