diff options
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/aarch64/memrchr.S | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S new file mode 100644 index 0000000..0565168 --- /dev/null +++ b/sysdeps/aarch64/memrchr.S @@ -0,0 +1,165 @@ +/* memrchr - find the last occurrence of a byte in a memory block + + Copyright (C) 2015-2019 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define seek_dstin x11 +#define seek_dst x12 +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 + +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memrchr) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + add seek_dstin, cntin, srcin + add tmp, seek_dstin, #31 + bic seek_dst, tmp, #31 + dup vrepmask.4s, wtmp2 + ands soff, seek_dstin, #31 + mov tmp, #32 + sub soff,tmp,soff + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + sub seek_dst, seek_dst, #32 + ld1 {vdata1.16b, vdata2.16b}, [seek_dst] + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Clear the (32-soff)*2 upper bits */ + lsl tmp, soff, #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? */ + cbnz synd, L(tail) + +L(loop): + sub seek_dst, seek_dst, #32 + ld1 {vdata1.16b, vdata2.16b}, [seek_dst] + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.2d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Only do the clear for the last possible block */ + b.hi L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + +L(tail): + /* Compensate the last post-increment*/ + add seek_dst, seek_dst, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + sub result, seek_dst, synd, lsr #1 + sub result, result, #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret +END (__memrchr) +weak_alias (__memrchr, memrchr) +libc_hidden_builtin_def (memrchr) |