From 3972a306be01e02f8db61c60617f8421371658e0 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Wed, 11 Jan 2023 13:53:19 +0000
Subject: AArch64: Improve strrchr

Use shrn for narrowing the mask which simplifies code and speeds up small
strings.  Unroll the first search loop to improve performance on large
strings.

Reviewed-by: Szabolcs Nagy
(cherry picked from commit 55599d480437dcf129b41b95be32b48f2a9e5da9)
---
 sysdeps/aarch64/strrchr.S | 58 +++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index dc30643..de0b37a 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -22,19 +22,16 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
  * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define result		x0
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -46,7 +43,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -58,59 +54,71 @@
    the relevant byte matched the requested character; bits 2-3 are set
    if the relevant byte matched the NUL end of string.  */
 
-ENTRY(strrchr)
+ENTRY (strrchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	rbit	synd, synd
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne
-- 
cgit v1.1
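
For reference, the "shrn" narrowing the commit message describes compresses a
per-byte Advanced SIMD comparison mask into a 64-bit syndrome with four bits
per byte, replacing the old addp-based reduction.  The C sketch below is only
an illustration of that idea under simplifying assumptions: the helper names
(match_syndrome, first_match) are hypothetical, it assumes little-endian, and
it omits the NUL handling and backwards search that strrchr.S actually does.

#include <arm_neon.h>
#include <stdint.h>

/* Compress a 16-byte comparison mask (0x00/0xff per byte) into a 64-bit
   syndrome with 4 bits per byte, using the shrn narrowing trick.  */
static inline uint64_t
match_syndrome (const uint8_t *chunk, uint8_t c)
{
  uint8x16_t data  = vld1q_u8 (chunk);
  uint8x16_t match = vceqq_u8 (data, vdupq_n_u8 (c)); /* 0xff where chunk[i] == c.  */
  /* Shift each 16-bit lane right by 4 and narrow to 8 bits: every input
     byte contributes exactly one nibble to the 64-bit result.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (match), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}

/* Byte index of the first match in the chunk, or -1 if none (little-endian).  */
static inline int
first_match (const uint8_t *chunk, uint8_t c)
{
  uint64_t synd = match_syndrome (chunk, c);
  return synd ? (int) (__builtin_ctzll (synd) >> 2) : -1;
}

Because each input byte maps to four syndrome bits, a count of leading or
trailing zero bits divided by four yields the byte offset directly, which is
why the tail code in the patch computes the result with clz followed by
"sub result, result, tmp, lsr 2".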