aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/aarch64/strcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/aarch64/strcpy.S')
-rw-r--r--sysdeps/aarch64/strcpy.S32
1 files changed, 12 insertions, 20 deletions
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170..78d27b4 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
#define len x4
#define synd x4
#define tmp x5
-#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
@@ -50,9 +49,8 @@
#define dataq q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
#define dataq2 q1
#ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
# define IFSTPCPY(X,...)
#endif
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
cbz synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
fmov synd, dend
cbz synd, L(loop)
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd