aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 160
1 file changed, 89 insertions, 71 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
index 76cfcae..b9e5afd 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -45,91 +45,78 @@
The implementation can load bytes past a null terminator, but only
up to the next 16B boundary, so it never crosses a page. */
+/* Load quadword at addr+offset to vreg, check for null bytes,
+ and branch to label if any are found. */
+#define CHECK16(vreg,offset,addr,label) \
+ lxv vreg+32,offset(addr); \
+ vcmpequb. v6,vreg,v18; \
+ bne cr6,L(label);
+
.machine power9
ENTRY_TOCLESS (FUNC_NAME, 4)
CALL_MCOUNT 2
- /* NULL string optimisation */
- lbz r0,0(r4)
- stb r0,0(r3)
- cmpwi r0,0
- beqlr
-
- addi r4,r4,1
- addi r11,r3,1
-
vspltisb v18,0 /* Zeroes in v18 */
+ vspltisb v19,-1 /* 0xFF bytes in v19 */
- neg r5,r4
- rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
+ /* Next 16B-aligned address. Prepare address for L(loop). */
+ addi r5,r4,16
+ clrrdi r5,r5,4
+ subf r8,r4,r5
+ add r11,r3,r8
- /* Get source 16B aligned */
+ /* Align data and fill bytes not loaded with non matching char. */
lvx v0,0,r4
lvsr v1,0,r4
- vperm v0,v18,v0,v1
-
- vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
- vctzlsbb r7,v6 /* Number of trailing zeroes */
- addi r8,r7,1 /* Add null terminator */
+ vperm v0,v19,v0,v1
- /* r8 = bytes including null
- r9 = bytes to get source 16B aligned
- if r8 > r9
- no null, copy r9 bytes
- else
- there is a null, copy r8 bytes and return. */
- cmpd r8,r9
- bgt L(no_null)
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ beq cr6,L(no_null)
- sldi r10,r8,56 /* stxvl wants size in top 8 bits */
- stxvl 32+v0,r11,r10 /* Partial store */
+ /* There's a null byte. */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ addi r9,r8,1 /* Add null byte. */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits. */
+ stxvl 32+v0,r3,r10 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
- add r3,r11,r7
+ add r3,r3,r8
#endif
blr
L(no_null):
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
- stxvl 32+v0,r11,r10 /* Partial store */
-
- add r4,r4,r9
- add r11,r11,r9
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r3,r10 /* Partial store */
+ .p2align 4
L(loop):
- lxv 32+v0,0(r4)
- vcmpequb. v6,v0,v18 /* Any zero bytes? */
- bne cr6,L(tail1)
-
- lxv 32+v1,16(r4)
- vcmpequb. v6,v1,v18 /* Any zero bytes? */
- bne cr6,L(tail2)
-
- lxv 32+v2,32(r4)
- vcmpequb. v6,v2,v18 /* Any zero bytes? */
- bne cr6,L(tail3)
-
- lxv 32+v3,48(r4)
- vcmpequb. v6,v3,v18 /* Any zero bytes? */
- bne cr6,L(tail4)
+ CHECK16(v0,0,r5,tail1)
+ CHECK16(v1,16,r5,tail2)
+ CHECK16(v2,32,r5,tail3)
+ CHECK16(v3,48,r5,tail4)
+ CHECK16(v4,64,r5,tail5)
+ CHECK16(v5,80,r5,tail6)
stxv 32+v0,0(r11)
stxv 32+v1,16(r11)
stxv 32+v2,32(r11)
stxv 32+v3,48(r11)
+ stxv 32+v4,64(r11)
+ stxv 32+v5,80(r11)
- addi r4,r4,64
- addi r11,r11,64
+ addi r5,r5,96
+ addi r11,r11,96
b L(loop)
+ .p2align 4
L(tail1):
- vctzlsbb r8,v6
- addi r9,r8,1
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ addi r9,r8,1 /* Add null terminator */
sldi r9,r9,56 /* stxvl wants size in top 8 bits */
- stxvl 32+v0,r11,r9
+ stxvl 32+v0,r11,r9 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
@@ -137,50 +124,81 @@ L(tail1):
#endif
blr
+ .p2align 4
L(tail2):
stxv 32+v0,0(r11)
- vctzlsbb r8,v6 /* Number of trailing zeroes */
- addi r9,r8,1 /* Add null terminator */
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ vctzlsbb r8,v6
+ addi r9,r8,1
+ sldi r9,r9,56
addi r11,r11,16
- stxvl 32+v1,r11,r10 /* Partial store */
+ stxvl 32+v1,r11,r9
#ifdef USE_AS_STPCPY
- /* stpcpy returns the dest address plus the size not counting the
- final '\0'. */
add r3,r11,r8
#endif
blr
+ .p2align 4
L(tail3):
stxv 32+v0,0(r11)
stxv 32+v1,16(r11)
- vctzlsbb r8,v6 /* Number of trailing zeroes */
- addi r9,r8,1 /* Add null terminator */
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ vctzlsbb r8,v6
+ addi r9,r8,1
+ sldi r9,r9,56
addi r11,r11,32
- stxvl 32+v2,r11,r10 /* Partial store */
+ stxvl 32+v2,r11,r9
#ifdef USE_AS_STPCPY
- /* stpcpy returns the dest address plus the size not counting the
- final '\0'. */
add r3,r11,r8
#endif
blr
+ .p2align 4
L(tail4):
stxv 32+v0,0(r11)
stxv 32+v1,16(r11)
stxv 32+v2,32(r11)
- vctzlsbb r8,v6 /* Number of trailing zeroes */
- addi r9,r8,1 /* Add null terminator */
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ vctzlsbb r8,v6
+ addi r9,r8,1
+ sldi r9,r9,56
addi r11,r11,48
- stxvl 32+v3,r11,r10 /* Partial store */
+ stxvl 32+v3,r11,r9
#ifdef USE_AS_STPCPY
- /* stpcpy returns the dest address plus the size not counting the
- final '\0'. */
add r3,r11,r8
#endif
blr
+
+ .p2align 4
+L(tail5):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+ vctzlsbb r8,v6
+ addi r9,r8,1
+ sldi r9,r9,56
+ addi r11,r11,64
+ stxvl 32+v4,r11,r9
+#ifdef USE_AS_STPCPY
+ add r3,r11,r8
+#endif
+ blr
+
+ .p2align 4
+L(tail6):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+ stxv 32+v4,64(r11)
+ vctzlsbb r8,v6
+ addi r9,r8,1
+ sldi r9,r9,56
+ addi r11,r11,80
+ stxvl 32+v5,r11,r9
+#ifdef USE_AS_STPCPY
+ add r3,r11,r8
+#endif
+ blr
+
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)