author     Danila Kutenin <danilak@google.com>      2022-06-27 16:12:13 +0000
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>    2022-07-06 09:26:20 +0100
commit     3c9980698988ef64072f1fac339b180f52792faf (patch)
tree       3c32dabb3fcbfa564647fcedd9be5c7674a30fc2 /sysdeps/aarch64/strchrnul.S
parent     bd0b58837c7df091046e7531642f379a52e1e157 (diff)
aarch64: Optimize string functions with shrn instruction
We found that the string functions were using AND+ADDP to compute the nibble/syndrome mask, but there is a simpler option: `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane right by 4 and narrow to 8 bits), which has the same latency as ADDP on all SIMD ARMv8 targets. There are also possible gains for memcmp, but that is left for another patch.

We see 10-20% savings for small to mid-size cases (<= 128 bytes), which are the primary cases for general workloads.
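The reduction this relies on can be sketched with ACLE NEON intrinsics. This is a minimal illustration, assuming the input is a 0x00/0xff per-byte comparison result; the helper name nibble_mask and the use of __builtin_ctzll are assumptions for the example, not glibc code:

#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: reduce a 16-byte comparison result (each byte 0x00 or 0xff)
   to a 64-bit mask with four bits per input byte, using the SHRN trick.
   nibble_mask is a hypothetical helper, not a glibc symbol.  */
static inline uint64_t
nibble_mask (uint8x16_t cmp)
{
  /* View the 16 bytes as eight 16-bit lanes, shift each lane right by 4
     and narrow to 8 bits; every source byte contributes one nibble.  */
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

/* On little-endian, source byte i lands in bits [4*i+3 : 4*i], so the index
   of the first matching byte is __builtin_ctzll (mask) / 4 when mask != 0.  */

Because the comparison bytes are always 0x00 or 0xff, this single SHRN produces the same 64-bit mask as the previous AND with a repeating 0xf00f pattern followed by ADDP, while also dropping the vrepmask constant.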
Diffstat (limited to 'sysdeps/aarch64/strchrnul.S')
-rw-r--r--  sysdeps/aarch64/strchrnul.S | 29
1 file changed, 11 insertions(+), 18 deletions(-)
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726f..ee154ab 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
-#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vend v4
+#define dend d4
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__strchrnul)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
fmov tmp1, dend
cbz tmp1, L(loop)
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1