diff options
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r-- | sysdeps/aarch64/multiarch/memset_oryon1.S | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 0f9b718..88f4ef4 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -90,6 +90,8 @@ L(set_long):
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
 	b.eq	L(try_zva)
+	cmp	count, #32768
+	b.hi	L(set_long_with_nontemp)	/* More than 32768 bytes: take the stnp path. */
 	/* Small-size or non-zero memset does not use DC ZVA. */
 	sub	count, dstend, dst
 
@@ -112,6 +114,30 @@ L(set_long):
 	stp	val, val, [dstend, -16]
 	ret
 
+L(set_long_with_nontemp):
+	/* Non-zero fill of more than 32768 bytes (see L(set_long)): same loop shape, but with non-temporal stnp. */
+	sub	count, dstend, dst
+
+	/* Adjust count and bias for loop. By subtracting extra 1 from count,
+	   it is easy to use tbz instruction to check whether loop tailing
+	   count is less than 33 bytes, so as to bypass 2 unnecessary stnps. */
+	sub	count, count, 64+16+1
+
+1:	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+	stnp	val, val, [dst, 48]
+	stnp	val, val, [dst, 64]
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hs	1b	/* Keep looping until the subtraction borrows. */
+
+	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
+	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+1:	stnp	val, val, [dstend, -32]	/* Final stores are placed relative to dstend. */
+	stnp	val, val, [dstend, -16]
+	ret
+
 L(try_zva):
 	/* Write the first and last 64 byte aligned block using stp rather
 	   than using DC ZVA as it is faster. */ |