diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 3 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S | 40 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 7 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/strcmp.S | 257 |
7 files changed, 318 insertions, 4 deletions
@@ -1,6 +1,16 @@ 2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com> * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Add strcmp-power8 object. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strcmp_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add + __strcmp_power8 implementation. + * sysdeps/powerpc/powerpc64/power8/strcmp.S: New file. + * NEWS: Update. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add strncpy-power8 and stpncpy-power8 objects. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8 @@ -19,8 +19,9 @@ Version 2.21 17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803, 17806, 17834 -* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for +* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for powerpc64/powerpc64le. + Implemented by Adhemerval Zanella (IBM). * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64 and powerpc64le. This may improve lock scaling of existing programs on diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 18d3378..ec4fca7 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -18,7 +18,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ strncpy-power7 strncpy-ppc64 \ stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \ - strcmp-power7 strcmp-ppc64 \ + strcmp-power8 strcmp-power7 strcmp-ppc64 \ strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \ memmove-ppc64 bcopy-ppc64 strncpy-power8 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 132cb13..2c03060 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -301,6 +301,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ IFUNC_IMPL (i, name, strcmp, IFUNC_IMPL_ADD (array, i, strcmp, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strcmp_power8) + IFUNC_IMPL_ADD (array, i, strcmp, hwcap & PPC_FEATURE_HAS_VSX, __strcmp_power7) IFUNC_IMPL_ADD (array, i, strcmp, 1, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S new file mode 100644 index 0000000..dc4bfac --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S @@ -0,0 +1,40 @@ +/* Optimized strcmp implementation for POWER8/PPC64. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strcmp_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strcmp_power8): \ + cfi_startproc; \ + LOCALENTRY(__strcmp_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strcmp_power8) \ + END_2(__strcmp_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power8/strcmp.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index 9b2922f..b45ba1f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -23,9 +23,12 @@ extern __typeof (strcmp) __strcmp_ppc attribute_hidden; extern __typeof (strcmp) __strcmp_power7 attribute_hidden; +extern __typeof (strcmp) __strcmp_power8 attribute_hidden; libc_ifunc (strcmp, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strcmp_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strcmp_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strcmp_power7 : __strcmp_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S new file mode 100644 index 0000000..223d891 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S @@ -0,0 +1,257 @@ +/* Optimized strcmp implementation for PowerPC64/POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Implements the function + + size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + +EALIGN (strcmp, 4, 0) + li r0,0 + + /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using + the code: + + (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) + + with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */ + + rldicl r7,r3,0,52 + rldicl r9,r4,0,52 + cmpldi cr7,r7,4096-32 + bgt cr7,L(pagecross_check) + cmpldi cr5,r9,4096-32 + bgt cr5,L(pagecross_check) + + /* For short string up to 32 bytes, load both s1 and s2 using + unaligned dwords and compare. */ + ld r8,0(r3) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,8(r3) + ld r10,8(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,16(r3) + ld r10,16(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,24(r3) + ld r10,24(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + addi r7,r3,32 + addi r4,r4,32 + +L(align_8b): + /* Now it has checked for first 32 bytes, align source1 to doubleword + and adjust source2 address. */ + rldicl r9,r7,0,61 /* source1 alignment to doubleword */ + subf r4,r9,r4 /* Adjust source2 address based on source1 + alignment. */ + rldicr r7,r7,0,60 /* Align source1 to doubleword. */ + + /* At this point, source1 alignment is 0 and source2 alignment is + between 0 and 7. Check is source2 alignment is 0, meaning both + sources have the same alignment. */ + andi. r9,r4,0x7 + bne cr0,L(loop_diff_align) + + /* If both source1 and source2 are doubleword aligned, there is no + need for page boundary cross checks. */ + + ld r8,0(r7) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + .align 4 +L(loop_equal_align): + ld r8,8(r7) + ld r10,8(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,16(r7) + ld r10,16(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ldu r8,24(r7) + ldu r10,24(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + b L(loop_equal_align) + + /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb + result and r10 the dword from s2. To code isolate the byte + up to end (including the '\0'), masking with 0xFF the remaining + ones: + + #if __LITTLE_ENDIAN__ + (__builtin_ffsl (x) - 1) = counting trailing zero bits + r9 = (__builtin_ffsl (r9) - 1) + 8; + r9 = -1UL << r9 + #else + r9 = __builtin_clzl (r9) + 8; + r9 = -1UL >> r9 + #endif + r8 = r8 | r9 + r10 = r10 | r9 */ + +#ifdef __LITTLE_ENDIAN__ + nor r9,r9,r9 +L(different_nocmpb): + neg r3,r9 + and r9,r9,r3 + cntlzd r9,r9 + subfic r9,r9,63 +#else + not r9,r9 +L(different_nocmpb): + cntlzd r9,r9 + subfic r9,r9,56 +#endif + srd r3,r8,r9 + srd r10,r10,r9 + rldicl r10,r10,0,56 + rldicl r3,r3,0,56 + subf r3,r10,r3 + extsw r3,r3 + blr + + .align 4 +L(pagecross_check): + subfic r9,r9,4096 + subfic r7,r7,4096 + cmpld cr7,r7,r9 + bge cr7,L(pagecross) + mr r7,r9 + + /* If unaligned 16 bytes reads across a 4K page boundary, it uses + a simple byte a byte comparison until the page alignment for s1 + is reached. */ +L(pagecross): + add r7,r3,r7 + subf r9,r3,r7 + mtctr r9 + + .align 4 +L(pagecross_loop): + /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 + and if *s1 is '\0'. */ + lbz r9,0(r3) + lbz r10,0(r4) + addi r3,r3,1 + addi r4,r4,1 + cmplw cr7,r9,r10 + cmpdi cr5,r9,r0 + bne cr7,L(pagecross_ne) + beq cr5,L(pagecross_nullfound) + bdnz L(pagecross_loop) + b L(align_8b) + + .align 4 + /* The unaligned read of source2 will cross a 4K page boundary, + and the different byte or NULL maybe be in the remaining page + bytes. Since it can not use the unaligned load, the algorithm + reads and compares 8 bytes to keep source1 doubleword aligned. */ +L(check_source2_byte): + li r9,8 + mtctr r9 + + .align 4 +L(check_source2_byte_loop): + lbz r9,0(r7) + lbz r10,0(r4) + addi r7,r7,1 + addi r4,r4,1 + cmplw cr7,r9,10 + cmpdi r5,r9,0 + bne cr7,L(pagecross_ne) + beq cr5,L(pagecross_nullfound) + bdnz L(check_source2_byte_loop) + + /* If source2 is unaligned to doubleword, the code needs to check + on each interation if the unaligned doubleword access will cross + a 4k page boundary. */ + .align 5 +L(loop_unaligned): + ld r8,0(r7) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + addi r7,r7,8 + addi r4,r4,8 + +L(loop_diff_align): + /* Check if [src2]+8 cross a 4k page boundary: + + srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) + + with PAGE_SIZE being 4096. */ + rldicl r9,r4,0,52 + cmpldi cr7,r9,4088 + ble cr7,L(loop_unaligned) + b L(check_source2_byte) + + .align 4 +L(pagecross_ne): + extsw r3,r9 + mr r9,r10 +L(pagecross_retdiff): + subf r9,r9,r3 + extsw r3,r9 + blr + + .align 4 +L(pagecross_nullfound): + li r3,0 + b L(pagecross_retdiff) +END (strcmp) +libc_hidden_builtin_def (strcmp) |