Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/memrchr.S | 165
1 file changed, 165 insertions, 0 deletions
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
new file mode 100644
index 0000000..0565168
--- /dev/null
+++ b/sysdeps/aarch64/memrchr.S
@@ -0,0 +1,165 @@
+/* memrchr - find the last occurrence of a byte in a memory block
+
+ Copyright (C) 2015-2019 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define seek_dstin x11 /* srcin + cntin: one past the last byte.  */
+#define seek_dst x12 /* Aligned pointer that scans backwards.  */
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. In each two-bit tuple, bit 0 is set if the corresponding byte
+ * matched the requested character and bit 1 is unused (this is faster than
+ * using a 32-bit syndrome). Since the bits in the syndrome mirror the byte
+ * order of the original string and the search runs backwards, counting
+ * leading zeros identifies exactly which byte matched.
+ */
+
+ENTRY (__memrchr)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
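+ /*
+ * Replicated into every 32-bit lane, the constant yields the byte
+ * pattern 0x01 0x04 0x10 0x40. After ANDing it with the cmeq
+ * results, byte N of a chunk contributes bit 2*N, and the addp
+ * reductions below pack those bits into one 64-bit syndrome.
+ */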
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ add seek_dstin, cntin, srcin
+ add tmp, seek_dstin, #31
+ bic seek_dst, tmp, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, seek_dstin, #31
+ mov tmp, #32
+ sub soff, tmp, soff
+ and cntrem, cntin, #31
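+ /*
+ * When the end is not aligned, soff = 32 - (end mod 32) is the
+ * number of padding bytes between the end of the buffer and the
+ * aligned load above it. The b.eq below tests the flags still live
+ * from the ands and is taken when the end is already aligned.
+ */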
+ b.eq L(loop)
+
+ /*
+ * The end of the buffer is not 32-byte aligned. We calculate the
+ * syndrome value for the aligned 32-byte block containing the last
+ * bytes of the buffer and mask off the irrelevant part.
+ */
+
+ sub seek_dst, seek_dst, #32
+ ld1 {vdata1.16b, vdata2.16b}, [seek_dst]
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
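+ /*
+ * cntin drops by the number of valid bytes in this first block
+ * (32 - soff); the flags set here feed the b.ls below, which is
+ * taken when the whole buffer fits in this one block.
+ */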
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff * 2 upper bits (bytes past the end of the buffer) */
+ lsl tmp, soff, #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
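+ /*
+ * E.g. if end mod 32 = 5, then soff = 27 and only the low 10 bits
+ * (5 valid bytes * 2) survive this shift pair.
+ */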
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ sub seek_dst, seek_dst, #32
+ ld1 {vdata1.16b, vdata2.16b}, [seek_dst]
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data, we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
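+ /*
+ * The orr/addp pair above only tells us whether any byte matched
+ * (synd != 0); the precise per-byte syndrome is computed at L(end)
+ * once a match or the end of the data is seen.
+ */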
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 lower bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
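+ /*
+ * E.g. with cntrem = 3 and soff = 27 (end mod 32 = 5), the start of
+ * the buffer sits 2 bytes into the block, so (32 - (30 % 32)) * 2 = 4
+ * low bits are cleared, discarding the bytes below the start.
+ */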
+
+L(tail):
+ /* Compensate for the final pre-decrement of seek_dst */
+ add seek_dst, seek_dst, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ sub result, seek_dst, synd, lsr #1
+ sub result, result, #1
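+ /*
+ * For a match at offset i of the 32-byte block, the highest set bit
+ * is 2*i, so clz = 63 - 2*i and seek_dst - (clz >> 1) - 1 equals
+ * block base + i, the address of the last match.
+ */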
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+END (__memrchr)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
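
For reference, the syndrome technique described in the header comment can be
modelled in portable C. The sketch below is illustrative only: memrchr_model
is a hypothetical name, it chunks relative to the end of the buffer rather
than to 32-byte-aligned addresses (the job of the soff/cntrem masking in the
assembly), and it uses the GCC/Clang builtin __builtin_clzll in place of the
clz instruction. The syndrome layout (bit 2*i set for a match at chunk
offset i) and the leading-zero lookup match the code above.

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of the syndrome technique (not glibc API).
   Byte i of a chunk owns bit 2*i of a 64-bit syndrome; the highest
   set bit therefore marks the last match within the chunk.  */
static void *
memrchr_model (const void *s, int c, size_t n)
{
  const unsigned char *base = s;
  unsigned char ch = (unsigned char) c;

  while (n > 0)
    {
      /* Size of the trailing (possibly partial) 32-byte chunk.  */
      size_t chunk = (n & 31) ? (n & 31) : 32;
      const unsigned char *p = base + n - chunk;
      uint64_t synd = 0;

      for (size_t i = 0; i < chunk; i++)
        if (p[i] == ch)
          synd |= (uint64_t) 1 << (2 * i);

      if (synd != 0)
        {
          /* 63 - clz gives the highest set bit, which is 2 * i for
             a match at chunk offset i.  */
          int hi = 63 - __builtin_clzll (synd);
          return (void *) (p + hi / 2);
        }
      n -= chunk;
    }
  return NULL;
}

Calling memrchr_model ("abca", 'a', 4) returns a pointer to the final 'a',
mirroring what the L(tail) arithmetic computes from clz in the assembly.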