author    Wilco Dijkstra <wilco.dijkstra@arm.com>  2021-07-01 15:30:42 +0100
committer Wilco Dijkstra <wdijkstr@arm.com>        2021-07-01 15:32:36 +0100
commit    252cad02d4c63540501b9b8c988cb91248563224 (patch)
tree      f43c836158ad49a84b5b348d241d59eb97a9bd0a /sysdeps
parent    eb68d7d23cc411acdf68a60f194343a6774d6194 (diff)
AArch64: Improve strnlen performance
Optimize strnlen by avoiding UMINV, which is slow on most cores. On Neoverse N1 large strings are 1.8x faster than the current version, and bench-strnlen is 50% faster overall. This version is MTE compatible.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
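For reference, strnlen's contract is simple: return the number of bytes before the first NUL, but never more than the supplied limit. A minimal byte-at-a-time sketch of that behaviour, which the optimized assembly below must match (ref_strnlen is a hypothetical name, not part of glibc or of this patch):

#include <stddef.h>

/* Count bytes before the first NUL, looking at no more than maxlen bytes.  */
size_t
ref_strnlen (const char *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen && s[i] != '\0'; i++)
    continue;
  return i;
}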
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/strnlen.S | 270
1 file changed, 89 insertions, 181 deletions
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 2b57575..37e9eed 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -22,197 +22,105 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-/* Arguments and results. */
#define srcin x0
-#define len x0
-#define limit x1
+#define cntin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
-#define data1 x3
-#define data2 x4
-#define data2a x5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define pos x13
-#define limit_wd x14
-
-#define dataq q2
-#define datav v2
-#define datab2 b3
-#define dataq2 q3
-#define datav2 v3
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strnlen)
PTR_ARG (0)
SIZE_ARG (1)
- cbz limit, L(hit_limit)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne L(misaligned)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-
- /* Start of critical section -- keep to one 64Byte cache line. */
-
- ldp data1, data2, [src], #16
-L(realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- subs limit_wd, limit_wd, #1
- orr tmp1, has_nul1, has_nul2
- ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq L(loop)
- /* End of critical section -- keep to one 64Byte cache line. */
-
- orr tmp1, has_nul1, has_nul2
- cbz tmp1, L(hit_limit) /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
- sub len, src, srcin
- cbz has_nul1, L(nul_in_data2)
-#ifdef __AARCH64EB__
- mov data2, data1
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-L(nul_in_data2):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
-#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
- RET
-
-L(loop):
- ldr dataq, [src], #16
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- subs limit_wd, limit_wd, #1
- ccmp tmp1, #0, #4, pl /* NZCV = 0000 */
- b.eq L(loop_end)
- ldr dataq, [src], #16
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- subs limit_wd, limit_wd, #1
- ccmp tmp1, #0, #4, pl /* NZCV = 0000 */
- b.ne L(loop)
-L(loop_end):
- /* End of critical section -- keep to one 64Byte cache line. */
-
- cbnz tmp1, L(hit_limit) /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
-#ifdef __AARCH64EB__
- rev64 datav.16b, datav.16b
-#endif
- /* Set the NULL byte as 0xff and the rest as 0x00, move the data into a
- pair of scalars and then compute the length from the earliest NULL
- byte. */
-
- cmeq datav.16b, datav.16b, #0
-#ifdef __AARCH64EB__
- mov data1, datav.d[1]
- mov data2, datav.d[0]
-#else
- mov data1, datav.d[0]
- mov data2, datav.d[1]
-#endif
- cmp data1, 0
- csel data1, data1, data2, ne
- sub len, src, srcin
- sub len, len, #16
- rev data1, data1
- add tmp2, len, 8
- clz tmp1, data1
- csel len, len, tmp2, ne
- add len, len, tmp1, lsr 3
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
- RET
-
-L(misaligned):
- /* Deal with a partial first word.
- We're doing two things in parallel here;
- 1) Calculate the number of words (but avoiding overflow if
- limit is near ULONG_MAX) - to do this we need to work out
- limit + tmp1 - 1 as a 65-bit value before shifting it;
- 2) Load and mask the initial data words - we force the bytes
- before the ones we are interested in to 0xff - this ensures
- early bytes will not hit any zero detection. */
- sub limit_wd, limit, #1
- neg tmp4, tmp1
- cmp tmp1, #8
-
- and tmp3, limit_wd, #15
- lsr limit_wd, limit_wd, #4
- mov tmp2, #~0
-
- ldp data1, data2, [src], #16
- lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
- add tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit_wd, tmp3, lsr #4
-
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b L(realigned)
+L(nomatch):
+ mov result, cntin
+ ret
-L(hit_limit):
- mov len, limit
- RET
END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
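The core-algorithm comment in the new code (four syndrome bits per byte, adjacent bytes merged with a pairwise add, first match located by counting trailing zeros) maps directly onto NEON intrinsics. A minimal C sketch of that step, assuming a little-endian target and a single 16-byte chunk; nul_syndrome and first_nul are hypothetical helper names, not part of this patch:

#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

/* Build the 64-bit syndrome for one 16-byte chunk: 0xff in each byte that
   is NUL, masked to the low nibble of even bytes and the high nibble of odd
   bytes, then adjacent bytes merged with a pairwise add so the result holds
   four bits per input byte, in string order.  */
static inline uint64_t
nul_syndrome (uint8x16_t chunk)
{
  uint8x16_t matches = vceqq_u8 (chunk, vdupq_n_u8 (0));            /* CMEQ */
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked  = vandq_u8 (matches, repmask);                  /* AND  */
  uint8x16_t merged  = vpaddq_u8 (masked, masked);                   /* ADDP */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);          /* FMOV */
}

/* Index of the first NUL byte in the chunk, or 16 if there is none.
   A trailing-zero count replaces the RBIT+CLZ pair in the assembly;
   dividing by four turns a nibble position back into a byte index.  */
static inline size_t
first_nul (uint8x16_t chunk)
{
  uint64_t synd = nul_syndrome (chunk);
  return synd ? (size_t) (__builtin_ctzll (synd) >> 2) : 16;
}

Compared with the old loop, which reduced each chunk with UMINV and then needed extra work to recover the byte position, the syndrome keeps positional information: a single CBNZ on the scalar says whether a NUL was seen, and a trailing-zero count pinpoints it. The first, possibly unaligned chunk is handled by loading from the 16-byte aligned address at or below srcin and shifting the syndrome right by four bits per skipped byte (the lsl shift, srcin, 2 / lsr synd, shift pair), so every load stays 16-byte aligned, which is what keeps the routine MTE compatible.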