aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog11
-rw-r--r--sysdeps/arm/armv7/multiarch/Makefile2
-rw-r--r--sysdeps/arm/armv7/multiarch/ifunc-impl-list.c5
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr.S59
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr_impl.S219
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr_neon.S9
6 files changed, 304 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 8ef682a..ad24611 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2017-06-27 Prakhar Bahuguna <prakhar.bahuguna@arm.com>
+
+ * sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to
+ sysdep_routines.
+ * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for
+ __memchr_neon.
+ Add ifunc definitions for __memchr_neon and __memchr_noneon.
+ * sysdeps/arm/armv7/multiarch/memchr.S: New file.
+ * sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise.
+ * sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise.
+
2017-06-27 Stefan Liebler <stli@linux.vnet.ibm.com>
* sysdeps/s390/utf8-utf16-z9.c (__to_utf8_loop_vx_cu):
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
index e834cc9..9e1e61c 100644
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -1,3 +1,3 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon
endif
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
index b8094fd..8f33156 100644
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
bool use_neon = true;
#ifdef __ARM_NEON__
# define __memcpy_neon memcpy
+# define __memchr_neon memchr
#else
use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
#endif
@@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
return i;
}
diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S
new file mode 100644
index 0000000..8e8097a
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if IS_IN (libc)
+/* Under __ARM_NEON__, memchr_neon.S defines the name memchr. */
+# ifndef __ARM_NEON__
+ .text
+ .arm
+ENTRY(memchr)
+ .type memchr, %gnu_indirect_function
+ ldr r1, .Lmemchr_noneon
+ tst r0, #HWCAP_ARM_NEON
+ ldrne r1, .Lmemchr_neon
+1:
+ add r0, r1, pc
+ DO_RET(lr)
+
+.Lmemchr_noneon:
+ .long C_SYMBOL_NAME(__memchr_noneon) - 1b - 8
+.Lmemchr_neon:
+ .long C_SYMBOL_NAME(__memchr_neon) - 1b - 8
+
+END(memchr)
+
+libc_hidden_builtin_def (memchr)
+# endif /* Not __ARM_NEON__. */
+libc_hidden_def (__memchr_noneon)
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+# undef weak_alias
+# define weak_alias(x, y)
+# undef libc_hidden_def
+# define libc_hidden_def(name)
+
+# define memchr __memchr_noneon
+
+#endif
+
+#include "memchr_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S
new file mode 100644
index 0000000..e8cbb97
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_impl.S
@@ -0,0 +1,219 @@
+/* memchr implemented using NEON.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef MEMCHR_NEON
+
+#include <sysdep.h>
+
+ .arch armv7-a
+ .fpu neon
+
+
+/* Arguments */
+#define srcin r0
+#define chrin r1
+#define cntin r2
+
+/* Retval */
+#define result r0 /* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src r1 /* Live range does not overlap with chrin */
+#define tmp r3
+#define synd r0 /* No overlap with srcin or result */
+#define soff r12
+
+/* Working NEON registers */
+#define vrepchr q0
+#define vdata0 q1
+#define vdata0_0 d2 /* Lower half of vdata0 */
+#define vdata0_1 d3 /* Upper half of vdata0 */
+#define vdata1 q2
+#define vdata1_0 d4 /* Lower half of vhas_chr0 */
+#define vdata1_1 d5 /* Upper half of vhas_chr0 */
+#define vrepmask q3
+#define vrepmask0 d6
+#define vrepmask1 d7
+#define vend q4
+#define vend0 d8
+#define vend1 d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows to identify exactly which byte has matched.
+ */
+
+#ifndef NO_THUMB
+ .thumb_func
+#else
+ .arm
+#endif
+ .p2align 4,,15
+
+ENTRY(memchr)
+ /* Use a simple loop if there are less than 8 bytes to search. */
+ cmp cntin, #7
+ bhi .Llargestr
+ and chrin, chrin, #0xff
+
+.Lsmallstr:
+ subs cntin, cntin, #1
+ blo .Lnotfound /* Return not found if reached end. */
+ ldrb tmp, [srcin], #1
+ cmp tmp, chrin
+ bne .Lsmallstr /* Loop again if not found. */
+ /* Otherwise fixup address and return. */
+ sub result, srcin, #1
+ bx lr
+
+
+.Llargestr:
+ vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */
+ /*
+ * Magic constant 0x8040201008040201 allows us to identify which lane
+ * matches the requested byte.
+ */
+ movw tmp, #0x0201
+ movt tmp, #0x0804
+ lsl soff, tmp, #4
+ vmov vrepmask0, tmp, soff
+ vmov vrepmask1, tmp, soff
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ ands soff, srcin, #31
+ beq .Lloopintro /* Go straight to main loop if it's aligned. */
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+
+ /* Clear the soff lower bits */
+ lsr synd, synd, soff
+ lsl synd, synd, soff
+ /* The first block can also be the last */
+ bls .Lmasklast
+ /* Have we found something already? */
+#ifndef NO_THUMB
+ cbnz synd, .Ltail
+#else
+ cmp synd, #0
+ bne .Ltail
+#endif
+
+
+.Lloopintro:
+ vpush {vend}
+ /* 264/265 correspond to d8/d9 for q4 */
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (264, 0)
+ cfi_rel_offset (265, 8)
+ .p2align 3,,7
+.Lloop:
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ subs cntin, cntin, #32
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ /* If we're out of data we finish regardless of the result. */
+ bls .Lend
+ /* Use a fast check for the termination condition. */
+ vorr vend, vdata0, vdata1
+ vorr vend0, vend0, vend1
+ vmov synd, tmp, vend0
+ orrs synd, synd, tmp
+ /* We're not out of data, loop if we haven't found the character. */
+ beq .Lloop
+
+.Lend:
+ vpop {vend}
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (264)
+ cfi_restore (265)
+
+ /* Termination condition found, let's calculate the syndrome value. */
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+#ifndef NO_THUMB
+ cbz synd, .Lnotfound
+ bhi .Ltail /* Uses the condition code from
+ subs cntin, cntin, #32 above. */
+#else
+ cmp synd, #0
+ beq .Lnotfound
+ cmp cntin, #0
+ bhi .Ltail
+#endif
+
+
+.Lmasklast:
+ /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+ neg cntin, cntin
+ lsl synd, synd, cntin
+ lsrs synd, synd, cntin
+ it eq
+ moveq src, #0 /* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result and return */
+ add result, src, synd
+ bx lr
+
+
+.Lnotfound:
+ /* Set result to NULL if not found and return */
+ mov result, #0
+ bx lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
+
+#else
+
+#include "../../armv6t2/memchr.S"
+
+#endif
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000..ee21818
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,9 @@
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memchr directly. */
+libc_hidden_builtin_def (memchr)
+#else
+# define memchr __memchr_neon
+#endif
+
+#define MEMCHR_NEON
+#include "memchr_impl.S"