diff options
author | MAHESH BODAPATI <bmahi496@linux.ibm.com> | 2023-12-12 08:52:45 -0600 |
---|---|---|
committer | Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com> | 2023-12-14 14:40:14 -0600 |
commit | b9182c793caa05df5d697427c0538936e6396d4b (patch) | |
tree | 2fe92e68ae203533eb5a2cc1520d84b15b3c992e | |
parent | d0aefec49941cf6d97e2244d6aa20bafc26d5942 (diff) | |
download | glibc-b9182c793caa05df5d697427c0538936e6396d4b.zip glibc-b9182c793caa05df5d697427c0538936e6396d4b.tar.gz glibc-b9182c793caa05df5d697427c0538936e6396d4b.tar.bz2 |
powerpc : Add optimized memchr for POWER10
Optimized memchr for POWER10 based on existing rawmemchr and strlen.
Reordering instructions and loop unrolling helped in getting better performance.
Reviewed-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
-rw-r--r-- | sysdeps/powerpc/powerpc64/le/power10/memchr.S | 315 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 8 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 6 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S | 28 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/memchr.c | 20 |
5 files changed, 367 insertions, 10 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S new file mode 100644 index 0000000..faf293f --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/memchr.S @@ -0,0 +1,315 @@ +/* Optimized memchr implementation for POWER10 LE. + Copyright (C) 2021-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +# ifndef MEMCHR +# define MEMCHR __memchr +# endif +# define M_VREG_ZERO v20 +# define M_OFF_START_LOOP 256 +# define MEMCHR_SUBTRACT_VECTORS \ + vsububm v4,v4,v18; \ + vsububm v5,v5,v18; \ + vsububm v6,v6,v18; \ + vsububm v7,v7,v18; +# define M_TAIL(vreg,increment) \ + vctzlsbb r4,vreg; \ + cmpld r5,r4; \ + ble L(null); \ + addi r4,r4,increment; \ + add r3,r6,r4; \ + blr + +/* TODO: Replace macros by the actual instructions when minimum binutils becomes + >= 2.35. This is used to keep compatibility with older versions. */ +#define M_VEXTRACTBM(rt,vrb) \ + .long(((4)<<(32-6)) \ + | ((rt)<<(32-11)) \ + | ((8)<<(32-16)) \ + | ((vrb)<<(32-21)) \ + | 1602) + +#define M_LXVP(xtp,dq,ra) \ + .long(((6)<<(32-6)) \ + | ((((xtp)-32)>>1)<<(32-10)) \ + | ((1)<<(32-11)) \ + | ((ra)<<(32-16)) \ + | dq) + +#define CHECK16B(vreg,offset,addr,label) \ + lxv vreg+32,offset(addr); \ + vcmpequb. vreg,vreg,v18; \ + bne cr6,L(label); \ + cmpldi r5,16; \ + ble L(null); \ + addi r5,r5,-16; + +/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # + of bytes already checked. */ +#define CHECK64B(offset,addr,label) \ + M_LXVP(v4+32,offset,addr); \ + M_LXVP(v6+32,offset+32,addr); \ + MEMCHR_SUBTRACT_VECTORS; \ + vminub v14,v4,v5; \ + vminub v15,v6,v7; \ + vminub v16,v14,v15; \ + vcmpequb. v0,v16,M_VREG_ZERO; \ + beq cr6,$+12; \ + li r7,offset; \ + b L(label); \ + cmpldi r5,64; \ + ble L(null); \ + addi r5,r5,-64 + +/* Implements the function + void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */ + + .machine power9 + +ENTRY_TOCLESS (MEMCHR) + CALL_MCOUNT 3 + + cmpldi r5,0 + beq L(null) + mr r0,r5 + xori r6,r4,0xff + + mtvsrd v18+32,r4 /* matching char in v18 */ + mtvsrd v19+32,r6 /* non matching char in v19 */ + + vspltb v18,v18,7 /* replicate */ + vspltb v19,v19,7 /* replicate */ + vspltisb M_VREG_ZERO,0 + + /* Next 16B-aligned address. Prepare address for L(aligned). */ + addi r6,r3,16 + clrrdi r6,r6,4 + + /* Align data and fill bytes not loaded with non matching char. */ + lvx v0,0,r3 + lvsr v1,0,r3 + vperm v0,v19,v0,v1 + + vcmpequb. v6,v0,v18 + bne cr6,L(found) + sub r4,r6,r3 + cmpld r5,r4 + ble L(null) + sub r5,r5,r4 + + /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is + optimized for longer strings, so checking the first bytes in 16B + chunks benefits a lot small strings. */ + .p2align 5 +L(aligned): + cmpldi r5,0 + beq L(null) + + CHECK16B(v0,0,r6,tail1) + CHECK16B(v1,16,r6,tail2) + CHECK16B(v2,32,r6,tail3) + CHECK16B(v3,48,r6,tail4) + CHECK16B(v4,64,r6,tail5) + CHECK16B(v5,80,r6,tail6) + CHECK16B(v6,96,r6,tail7) + CHECK16B(v7,112,r6,tail8) + CHECK16B(v8,128,r6,tail9) + CHECK16B(v9,144,r6,tail10) + CHECK16B(v10,160,r6,tail11) + CHECK16B(v0,176,r6,tail12) + CHECK16B(v1,192,r6,tail13) + CHECK16B(v2,208,r6,tail14) + CHECK16B(v3,224,r6,tail15) + + cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to + choose how we will perform the main loop. */ + + /* Prepare address for the loop. */ + addi r4,r3,M_OFF_START_LOOP + clrrdi r4,r4,6 + sub r6,r4,r3 + sub r5,r0,r6 + addi r6,r4,128 + + /* If c == 0, use the loop without the vsububm. */ + beq cr5,L(loop) + + /* This is very similar to the block after L(loop), the difference is + that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract + each byte loaded by the char we are looking for, this way we can keep + using vminub to merge the results and checking for nulls. */ + .p2align 5 +L(memchr_loop): + CHECK64B(0,r4,pre_tail_64b) + CHECK64B(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64B(0,r6,tail_64b) + CHECK64B(64,r6,tail_64b) + addi r6,r6,256 + + CHECK64B(0,r4,pre_tail_64b) + CHECK64B(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64B(0,r6,tail_64b) + CHECK64B(64,r6,tail_64b) + addi r6,r6,256 + + b L(memchr_loop) + /* Switch to a more aggressive approach checking 64B each time. Use 2 + pointers 128B apart and unroll the loop once to make the pointer + updates and usages separated enough to avoid stalls waiting for + address calculation. */ + .p2align 5 +L(loop): +#undef MEMCHR_SUBTRACT_VECTORS +#define MEMCHR_SUBTRACT_VECTORS /* nothing */ + CHECK64B(0,r4,pre_tail_64b) + CHECK64B(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64B(0,r6,tail_64b) + CHECK64B(64,r6,tail_64b) + addi r6,r6,256 + + CHECK64B(0,r4,pre_tail_64b) + CHECK64B(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64B(0,r6,tail_64b) + CHECK64B(64,r6,tail_64b) + addi r6,r6,256 + + b L(loop) + + .p2align 5 +L(pre_tail_64b): + mr r6,r4 +L(tail_64b): + /* OK, we found a null byte. Let's look for it in the current 64-byte + block and mark it in its corresponding VR. lxvp vx,0(ry) puts the + low 16B bytes into vx+1, and the high into vx, so the order here is + v5, v4, v7, v6. */ + vcmpequb v1,v5,M_VREG_ZERO + vcmpequb v2,v4,M_VREG_ZERO + vcmpequb v3,v7,M_VREG_ZERO + vcmpequb v4,v6,M_VREG_ZERO + + /* Take into account the other 64B blocks we had already checked. */ + add r6,r6,r7 + /* Extract first bit of each byte. */ + M_VEXTRACTBM(r8,v1) + M_VEXTRACTBM(r9,v2) + M_VEXTRACTBM(r10,v3) + M_VEXTRACTBM(r11,v4) + + /* Shift each value into their corresponding position. */ + sldi r9,r9,16 + sldi r10,r10,32 + sldi r11,r11,48 + + /* Merge the results. */ + or r8,r8,r9 + or r9,r10,r11 + or r11,r9,r8 + + cnttzd r0,r11 /* Count trailing zeros before the match. */ + cmpld r5,r0 + ble L(null) + add r3,r6,r0 /* Compute final address. */ + blr + + .p2align 5 +L(tail1): + M_TAIL(v0,0) + + .p2align 5 +L(tail2): + M_TAIL(v1,16) + + .p2align 5 +L(tail3): + M_TAIL(v2,32) + + .p2align 5 +L(tail4): + M_TAIL(v3,48) + + .p2align 5 +L(tail5): + M_TAIL(v4,64) + + .p2align 5 +L(tail6): + M_TAIL(v5,80) + + .p2align 5 +L(tail7): + M_TAIL(v6,96) + + .p2align 5 +L(tail8): + M_TAIL(v7,112) + + .p2align 5 +L(tail9): + M_TAIL(v8,128) + + .p2align 5 +L(tail10): + M_TAIL(v9,144) + + .p2align 5 +L(tail11): + M_TAIL(v10,160) + + .p2align 5 +L(tail12): + M_TAIL(v0,176) + + .p2align 5 +L(tail13): + M_TAIL(v1,192) + + .p2align 5 +L(tail14): + M_TAIL(v2,208) + + .p2align 5 +L(tail15): + M_TAIL(v3,224) + + .p2align 5 +L(found): + vctzlsbb r7,v6 + cmpld r5,r7 + ble L(null) + add r3,r3,r7 + blr + + .p2align 5 +L(null): + li r3,0 + blr + +END (MEMCHR) + +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index d7824a9..594fbb8 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ - rawmemchr-power9 rawmemchr-power10 \ - strcmp-power9 strcmp-power10 strncmp-power9 \ - strcpy-power9 stpcpy-power9 \ +sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ + memmove-power10 memset-power10 rawmemchr-power9 \ + rawmemchr-power10 strcmp-power9 strcmp-power10 \ + strncmp-power9 strcpy-power9 stpcpy-power9 \ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 965dd17..1a34629 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -226,6 +226,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */ IFUNC_IMPL (i, name, memchr, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, memchr, + hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX, + __memchr_power10) +#endif IFUNC_IMPL_ADD (array, i, memchr, hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S new file mode 100644 index 0000000..b9ed792 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S @@ -0,0 +1,28 @@ +/* Optimized memchr implementation for POWER10/PPC64. + Copyright (C) 2016-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define MEMCHR __memchr_power10 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(name,alias) + +#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c index a1a8f3e..dbb30fe 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c @@ -25,15 +25,23 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; extern __typeof (__memchr) __memchr_power7 attribute_hidden; extern __typeof (__memchr) __memchr_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (__memchr) __memchr_power10 attribute_hidden; +# endif /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__memchr, - (hwcap2 & PPC_FEATURE2_ARCH_2_07 - && hwcap & PPC_FEATURE_HAS_ALTIVEC) - ? __memchr_power8 : - (hwcap & PPC_FEATURE_ARCH_2_06) - ? __memchr_power7 - : __memchr_ppc); +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX) + ? __memchr_power10 : +# endif + (hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __memchr_power8 : + (hwcap & PPC_FEATURE_ARCH_2_06) + ? __memchr_power7 + : __memchr_ppc); weak_alias (__memchr, memchr) libc_hidden_builtin_def (memchr) |