diff options
-rw-r--r-- | sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 160 |
1 files changed, 89 insertions, 71 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S index 76cfcae..b9e5afd 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S @@ -45,91 +45,78 @@ The implementation can load bytes past a null terminator, but only up to the next 16B boundary, so it never crosses a page. */ +/* Load quadword at addr+offset to vreg, check for null bytes, + and branch to label if any are found. */ +#define CHECK16(vreg,offset,addr,label) \ + lxv vreg+32,offset(addr); \ + vcmpequb. v6,vreg,v18; \ + bne cr6,L(label); + .machine power9 ENTRY_TOCLESS (FUNC_NAME, 4) CALL_MCOUNT 2 - /* NULL string optimisation */ - lbz r0,0(r4) - stb r0,0(r3) - cmpwi r0,0 - beqlr - - addi r4,r4,1 - addi r11,r3,1 - vspltisb v18,0 /* Zeroes in v18 */ + vspltisb v19,-1 /* 0xFF bytes in v19 */ - neg r5,r4 - rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */ + /* Next 16B-aligned address. Prepare address for L(loop). */ + addi r5,r4,16 + clrrdi r5,r5,4 + subf r8,r4,r5 + add r11,r3,r8 - /* Get source 16B aligned */ + /* Align data and fill bytes not loaded with non matching char. */ lvx v0,0,r4 lvsr v1,0,r4 - vperm v0,v18,v0,v1 - - vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ - vctzlsbb r7,v6 /* Number of trailing zeroes */ - addi r8,r7,1 /* Add null terminator */ + vperm v0,v19,v0,v1 - /* r8 = bytes including null - r9 = bytes to get source 16B aligned - if r8 > r9 - no null, copy r9 bytes - else - there is a null, copy r8 bytes and return. */ - cmpd r8,r9 - bgt L(no_null) + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ + beq cr6,L(no_null) - sldi r10,r8,56 /* stxvl wants size in top 8 bits */ - stxvl 32+v0,r11,r10 /* Partial store */ + /* There's a null byte. */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r9,r8,1 /* Add null byte. */ + sldi r10,r9,56 /* stxvl wants size in top 8 bits. */ + stxvl 32+v0,r3,r10 /* Partial store */ #ifdef USE_AS_STPCPY /* stpcpy returns the dest address plus the size not counting the final '\0'. */ - add r3,r11,r7 + add r3,r3,r8 #endif blr L(no_null): - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ - stxvl 32+v0,r11,r10 /* Partial store */ - - add r4,r4,r9 - add r11,r11,r9 + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r3,r10 /* Partial store */ + .p2align 4 L(loop): - lxv 32+v0,0(r4) - vcmpequb. v6,v0,v18 /* Any zero bytes? */ - bne cr6,L(tail1) - - lxv 32+v1,16(r4) - vcmpequb. v6,v1,v18 /* Any zero bytes? */ - bne cr6,L(tail2) - - lxv 32+v2,32(r4) - vcmpequb. v6,v2,v18 /* Any zero bytes? */ - bne cr6,L(tail3) - - lxv 32+v3,48(r4) - vcmpequb. v6,v3,v18 /* Any zero bytes? */ - bne cr6,L(tail4) + CHECK16(v0,0,r5,tail1) + CHECK16(v1,16,r5,tail2) + CHECK16(v2,32,r5,tail3) + CHECK16(v3,48,r5,tail4) + CHECK16(v4,64,r5,tail5) + CHECK16(v5,80,r5,tail6) stxv 32+v0,0(r11) stxv 32+v1,16(r11) stxv 32+v2,32(r11) stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) - addi r4,r4,64 - addi r11,r11,64 + addi r5,r5,96 + addi r11,r11,96 b L(loop) + .p2align 4 L(tail1): - vctzlsbb r8,v6 - addi r9,r8,1 + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r9,r8,1 /* Add null terminator */ sldi r9,r9,56 /* stxvl wants size in top 8 bits */ - stxvl 32+v0,r11,r9 + stxvl 32+v0,r11,r9 /* Partial store */ #ifdef USE_AS_STPCPY /* stpcpy returns the dest address plus the size not counting the final '\0'. */ @@ -137,50 +124,81 @@ L(tail1): #endif blr + .p2align 4 L(tail2): stxv 32+v0,0(r11) - vctzlsbb r8,v6 /* Number of trailing zeroes */ - addi r9,r8,1 /* Add null terminator */ - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + vctzlsbb r8,v6 + addi r9,r8,1 + sldi r9,r9,56 addi r11,r11,16 - stxvl 32+v1,r11,r10 /* Partial store */ + stxvl 32+v1,r11,r9 #ifdef USE_AS_STPCPY - /* stpcpy returns the dest address plus the size not counting the - final '\0'. */ add r3,r11,r8 #endif blr + .p2align 4 L(tail3): stxv 32+v0,0(r11) stxv 32+v1,16(r11) - vctzlsbb r8,v6 /* Number of trailing zeroes */ - addi r9,r8,1 /* Add null terminator */ - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + vctzlsbb r8,v6 + addi r9,r8,1 + sldi r9,r9,56 addi r11,r11,32 - stxvl 32+v2,r11,r10 /* Partial store */ + stxvl 32+v2,r11,r9 #ifdef USE_AS_STPCPY - /* stpcpy returns the dest address plus the size not counting the - final '\0'. */ add r3,r11,r8 #endif blr + .p2align 4 L(tail4): stxv 32+v0,0(r11) stxv 32+v1,16(r11) stxv 32+v2,32(r11) - vctzlsbb r8,v6 /* Number of trailing zeroes */ - addi r9,r8,1 /* Add null terminator */ - sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + vctzlsbb r8,v6 + addi r9,r8,1 + sldi r9,r9,56 addi r11,r11,48 - stxvl 32+v3,r11,r10 /* Partial store */ + stxvl 32+v3,r11,r9 #ifdef USE_AS_STPCPY - /* stpcpy returns the dest address plus the size not counting the - final '\0'. */ add r3,r11,r8 #endif blr + + .p2align 4 +L(tail5): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + vctzlsbb r8,v6 + addi r9,r8,1 + sldi r9,r9,56 + addi r11,r11,64 + stxvl 32+v4,r11,r9 +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr + + .p2align 4 +L(tail6): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + vctzlsbb r8,v6 + addi r9,r8,1 + sldi r9,r9,56 + addi r11,r11,80 + stxvl 32+v5,r11,r9 +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr + END (FUNC_NAME) #ifndef USE_AS_STPCPY libc_hidden_builtin_def (strcpy) |