diff options
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/ifunc-impl-list.c | 5 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr.S | 59 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr_impl.S | 219 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr_neon.S | 9 |
6 files changed, 304 insertions, 1 deletions
@@ -1,3 +1,14 @@ +2017-06-27 Prakhar Bahuguna <prakhar.bahuguna@arm.com> + + * sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to + sysdep_routines. + * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for + __memchr_neon. + Add ifunc definitions for __memchr_neon and __memchr_noneon. + * sysdeps/arm/armv7/multiarch/memchr.S: New file. + * sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise. + * sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise. + 2017-06-27 Stefan Liebler <stli@linux.vnet.ibm.com> * sysdeps/s390/utf8-utf16-z9.c (__to_utf8_loop_vx_cu): diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile index e834cc9..9e1e61c 100644 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ b/sysdeps/arm/armv7/multiarch/Makefile @@ -1,3 +1,3 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp +sysdep_routines += memcpy_neon memcpy_vfp memchr_neon endif diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c index b8094fd..8f33156 100644 --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c +++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c @@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, bool use_neon = true; #ifdef __ARM_NEON__ # define __memcpy_neon memcpy +# define __memchr_neon memchr #else use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; #endif @@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon)); + return i; } diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S new file mode 100644 index 0000000..8e8097a --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr.S @@ -0,0 +1,59 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <rtld-global-offsets.h> + +#if IS_IN (libc) +/* Under __ARM_NEON__, memchr_neon.S defines the name memchr. */ +# ifndef __ARM_NEON__ + .text + .arm +ENTRY(memchr) + .type memchr, %gnu_indirect_function + ldr r1, .Lmemchr_noneon + tst r0, #HWCAP_ARM_NEON + ldrne r1, .Lmemchr_neon +1: + add r0, r1, pc + DO_RET(lr) + +.Lmemchr_noneon: + .long C_SYMBOL_NAME(__memchr_noneon) - 1b - 8 +.Lmemchr_neon: + .long C_SYMBOL_NAME(__memchr_neon) - 1b - 8 + +END(memchr) + +libc_hidden_builtin_def (memchr) +# endif /* Not __ARM_NEON__. */ +libc_hidden_def (__memchr_noneon) + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# undef weak_alias +# define weak_alias(x, y) +# undef libc_hidden_def +# define libc_hidden_def(name) + +# define memchr __memchr_noneon + +#endif + +#include "memchr_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S new file mode 100644 index 0000000..e8cbb97 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_impl.S @@ -0,0 +1,219 @@ +/* memchr implemented using NEON. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef MEMCHR_NEON + +#include <sysdep.h> + + .arch armv7-a + .fpu neon + + +/* Arguments */ +#define srcin r0 +#define chrin r1 +#define cntin r2 + +/* Retval */ +#define result r0 /* Live range does not overlap with srcin */ + +/* Working registers */ +#define src r1 /* Live range does not overlap with chrin */ +#define tmp r3 +#define synd r0 /* No overlap with srcin or result */ +#define soff r12 + +/* Working NEON registers */ +#define vrepchr q0 +#define vdata0 q1 +#define vdata0_0 d2 /* Lower half of vdata0 */ +#define vdata0_1 d3 /* Upper half of vdata0 */ +#define vdata1 q2 +#define vdata1_0 d4 /* Lower half of vhas_chr0 */ +#define vdata1_1 d5 /* Upper half of vhas_chr0 */ +#define vrepmask q3 +#define vrepmask0 d6 +#define vrepmask1 d7 +#define vend q4 +#define vend0 d8 +#define vend1 d9 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per + * byte. Each bit is set if the relevant byte matched the requested character + * and cleared otherwise. Since the bits in the syndrome reflect exactly the + * order in which things occur in the original string, counting trailing zeros + * allows to identify exactly which byte has matched. + */ + +#ifndef NO_THUMB + .thumb_func +#else + .arm +#endif + .p2align 4,,15 + +ENTRY(memchr) + /* Use a simple loop if there are less than 8 bytes to search. */ + cmp cntin, #7 + bhi .Llargestr + and chrin, chrin, #0xff + +.Lsmallstr: + subs cntin, cntin, #1 + blo .Lnotfound /* Return not found if reached end. */ + ldrb tmp, [srcin], #1 + cmp tmp, chrin + bne .Lsmallstr /* Loop again if not found. */ + /* Otherwise fixup address and return. */ + sub result, srcin, #1 + bx lr + + +.Llargestr: + vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ + /* + * Magic constant 0x8040201008040201 allows us to identify which lane + * matches the requested byte. + */ + movw tmp, #0x0201 + movt tmp, #0x0804 + lsl soff, tmp, #4 + vmov vrepmask0, tmp, soff + vmov vrepmask1, tmp, soff + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + ands soff, srcin, #31 + beq .Lloopintro /* Go straight to main loop if it's aligned. */ + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + vld1.8 {vdata0, vdata1}, [src:256]! + sub tmp, soff, #32 + adds cntin, cntin, tmp + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] + + /* Clear the soff lower bits */ + lsr synd, synd, soff + lsl synd, synd, soff + /* The first block can also be the last */ + bls .Lmasklast + /* Have we found something already? */ +#ifndef NO_THUMB + cbnz synd, .Ltail +#else + cmp synd, #0 + bne .Ltail +#endif + + +.Lloopintro: + vpush {vend} + /* 264/265 correspond to d8/d9 for q4 */ + cfi_adjust_cfa_offset (16) + cfi_rel_offset (264, 0) + cfi_rel_offset (265, 8) + .p2align 3,,7 +.Lloop: + vld1.8 {vdata0, vdata1}, [src:256]! + subs cntin, cntin, #32 + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + /* If we're out of data we finish regardless of the result. */ + bls .Lend + /* Use a fast check for the termination condition. */ + vorr vend, vdata0, vdata1 + vorr vend0, vend0, vend1 + vmov synd, tmp, vend0 + orrs synd, synd, tmp + /* We're not out of data, loop if we haven't found the character. */ + beq .Lloop + +.Lend: + vpop {vend} + cfi_adjust_cfa_offset (-16) + cfi_restore (264) + cfi_restore (265) + + /* Termination condition found, let's calculate the syndrome value. */ + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] +#ifndef NO_THUMB + cbz synd, .Lnotfound + bhi .Ltail /* Uses the condition code from + subs cntin, cntin, #32 above. */ +#else + cmp synd, #0 + beq .Lnotfound + cmp cntin, #0 + bhi .Ltail +#endif + + +.Lmasklast: + /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ + neg cntin, cntin + lsl synd, synd, cntin + lsrs synd, synd, cntin + it eq + moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ + + +.Ltail: + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Count the leading zeros */ + clz synd, synd + /* Compute the potential result and return */ + add result, src, synd + bx lr + + +.Lnotfound: + /* Set result to NULL if not found and return */ + mov result, #0 + bx lr + +END(memchr) +libc_hidden_builtin_def (memchr) + +#else + +#include "../../armv6t2/memchr.S" + +#endif diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S new file mode 100644 index 0000000..ee21818 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S @@ -0,0 +1,9 @@ +#ifdef __ARM_NEON__ +/* Under __ARM_NEON__, this file defines memchr directly. */ +libc_hidden_builtin_def (memchr) +#else +# define memchr __memchr_neon +#endif + +#define MEMCHR_NEON +#include "memchr_impl.S" |