diff options
author | remph <lhr@disroot.org> | 2025-09-04 12:53:56 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2025-09-10 16:12:23 +0000 |
commit | e20ca759af46fbb7eae20c52b857e7636eb50e1b (patch) | |
tree | 080b12e3af5d05a8048a0b7e5a79cd13134b3ba3 | |
parent | 1a076b5c21f05066e079e6a0aa6f73935f9c0e1e (diff) | |
download | glibc-e20ca759af46fbb7eae20c52b857e7636eb50e1b.zip glibc-e20ca759af46fbb7eae20c52b857e7636eb50e1b.tar.gz glibc-e20ca759af46fbb7eae20c52b857e7636eb50e1b.tar.bz2 |
AArch64: add optimised strspn/strcspn
Requires Neon (a.k.a. Advanced SIMD). Looks up 16 characters at a time,
for a 2-3x performance improvement, and a ~30% speedup on the strtok &
strsep benchtests, as tested on Cortex-A53 and Cortex-A72.
Signed-off-by: remph <lhr@disroot.org>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
-rw-r--r-- | sysdeps/aarch64/strcspn.S | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/strspn.S | 146 |
2 files changed, 148 insertions, 0 deletions
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
new file mode 100644
index 0000000..f2a69e9
--- /dev/null
+++ b/sysdeps/aarch64/strcspn.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCSPN 1
+#include "strspn.S"
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
new file mode 100644
index 0000000..edbb705
--- /dev/null
+++ b/sysdeps/aarch64/strspn.S
@@ -0,0 +1,146 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library. If not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCSPN
+# define STRSPN strcspn
+# define SBT orr /* SBT -- `set bit': mark each byte of SET as rejected */
+#else
+# define STRSPN strspn
+# define SBT bic /* Clear the bit: mark each byte of SET as accepted */
+#endif
+
+#ifdef __AARCH64EB__
+# define LS_FW lsl /* LS_FW is not used in this file */
+# define LS_BK lsr /* Used once, to build the first-block mask */
+#else
+# define LS_FW lsr /* LS_FW is not used in this file */
+# define LS_BK lsl /* Used once, to build the first-block mask */
+#endif
+
+#define og_s x0 /* Original string pointer, kept to compute the result */
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
+
+#define byte_i x3 /* Index of the 64-bit table word for a set byte */
+#define bits_i x4 /* Single-bit mask for that set byte within the word */
+#define one x6 /* Constant 1; shares x6 with `s', used only for the table */
+
+#define syndrome x5 /* First-block mask, then one nibble per scanned byte */
+#define s x6 /* 16-byte-aligned scan pointer */
+
+#define vbyte_i v1.16b
+#define vbits_i v2.16b
+#define table v4.16b-v5.16b
+#define table_a v4
+#define table_b v5
+#define sevens v7.16b
+
+ENTRY(STRSPN) /* In: x0 = s, x1 = set.  Out: x0 = length of the span */
+	ldrb	w2, [set]		/* w2 = set[0] */
+	cbz	w2, L(early)		/* Empty set */
+#ifdef USE_AS_STRCSPN
+	ldrb	w3, [set, 1]
+	cbz	w3, L(early)		/* Single-byte set */
+#endif
+
+	/* 32-byte stack table: bit C is one to reject byte C, zero to accept */
+	mov	one, 1
+#ifdef USE_AS_STRCSPN
+	stp	one, xzr, [sp, -32]!	/* Only bit 0 set: NUL is rejected */
+	.cfi_def_cfa_offset 32
+	stp	xzr, xzr, [sp, 16]
+#else
+	mvni	v0.4s, 0		/* All ones: everything rejected so far */
+	stp	q0, q0, [sp, -32]!
+	.cfi_def_cfa_offset 32
+#endif
+
+	.p2align 4
+L(fill_table):
+	lsr	byte_i, x2, 6 /* x2 / 64 */
+	lsl	bits_i, one, x2 /* x2 % 64 implicitly */
+	ldrb	w2, [set, 1]!		/* Advance SET and fetch its next byte */
+	ldr	x5, [sp, byte_i, lsl 3]	/* x5 is scratch here */
+	SBT	x5, x5, bits_i
+	str	x5, [sp, byte_i, lsl 3]
+	cbnz	w2, L(fill_table)	/* Stop at SET's terminating NUL */
+
+	ld1	{table_a.2d-table_b.2d}, [sp], 32 /* Table to v4-v5; pop it */
+	.cfi_def_cfa_offset 0
+	ubfiz	syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
+	and	s, og_s, -16 /* Round S down to 16-byte boundary */
+	movi	sevens, 7
+	/* Bias the syndrome to mask off the nibbles of the bytes before og_s */
+	mov	x8, -1
+	LS_BK	syndrome, x8, syndrome	/* Zeros on those nibbles */
+	mvn	syndrome, syndrome	/* Ones on them: the loop's bic drops them */
+
+L(loop):
+	ldr	q0, [s], 16		/* Load 16 bytes, post-increment s */
+	ushr	vbyte_i, v0.16b, 3	/* Table byte index = c >> 3 */
+	bic	vbits_i, sevens, v0.16b	/* Shift count = 7 - (c & 7) */
+	tbl	v0.16b, {table}, vbyte_i /* Table byte holding the bit for c */
+	/* Bring the relevant bit to the MSB of each byte */
+	sshl	v0.16b, v0.16b, vbits_i
+	/* Set every bit of each byte to its MSB */
+	cmlt	v0.16b, v0.16b, 0
+	/* Bytes->nibbles */
+	shrn	v0.8b, v0.8h, 4
+	fmov	x2, d0			/* One nibble per input byte */
+	bic	syndrome, x2, syndrome	/* Mask is zero after the first pass */
+	cbz	syndrome, L(loop)	/* No rejected byte in this block */
+
+#ifndef __AARCH64EB__
+	rbit	syndrome, syndrome	/* So clz finds the lowest set nibble */
+#endif
+	sub	s, s, 16		/* Undo the post-increment */
+	clz	syndrome, syndrome	/* 4 * index of the first rejected byte */
+	sub	x0, s, og_s		/* Offset of this block from og_s */
+	add	x0, x0, syndrome, lsr 2	/* Plus the byte index in the block */
+	ret
+
+	.balign 8 /* For strspn, which has only 2 instructions here */
+L(early):
+#ifdef USE_AS_STRCSPN
+	/* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
+	stp	fp, lr, [sp, -32]!
+	.cfi_def_cfa_offset 32
+	.cfi_offset fp, -32
+	.cfi_offset lr, -24
+	str	x19, [sp, 16]
+	.cfi_offset 19, -16
+	mov	w1, w2			/* Second argument = set[0] */
+	mov	fp, sp
+	mov	x19, x0			/* Keep s across the call */
+	bl	__strchrnul
+	sub	x0, x0, x19		/* Result = returned pointer - s */
+	ldr	x19, [sp, 16]
+	ldp	fp, lr, [sp], 32
+	.cfi_restore lr
+	.cfi_restore fp
+	.cfi_restore 19
+	.cfi_def_cfa_offset 0
+#else
+	mov	w0, 0			/* Empty set: the span is 0 */
+#endif
+	ret
+END(STRSPN)
+
+#undef set
+libc_hidden_def(STRSPN) |