aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilco Dijkstra <wdijkstr@arm.com>2020-07-17 14:09:36 +0100
committerWilco Dijkstra <wdijkstr@arm.com>2020-07-17 15:07:23 +0100
commitf46ef33ad134bec7ac992f28ee4b8b0614590e3e (patch)
treef34fe31172e393172c15b2bb189382ad9c0b17e2
parent76b8442db51a8976de19934638a42532a3af607f (diff)
downloadglibc-f46ef33ad134bec7ac992f28ee4b8b0614590e3e.zip
glibc-f46ef33ad134bec7ac992f28ee4b8b0614590e3e.tar.gz
glibc-f46ef33ad134bec7ac992f28ee4b8b0614590e3e.tar.bz2
AArch64: Improve strlen_asimd performance (bug 25824)
Optimize strlen using a mix of scalar and SIMD code. On modern micro architectures large strings are 2.6 times faster than existing strlen_asimd and 35% faster than the new MTE version of strlen. On a random strlen benchmark using small sizes the speedup is 7% vs strlen_asimd and 40% vs the MTE strlen. This fixes the main strlen regressions on Cortex-A53 and other cores with a simple Neon unit. Rename __strlen_generic to __strlen_mte, and select strlen_asimd when MTE is not enabled (this is waiting on support for a HWCAP_MTE bit). This fixes big-endian bug 25824. Passes GLIBC regression tests. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
-rw-r--r--sysdeps/aarch64/multiarch/Makefile2
-rw-r--r--sysdeps/aarch64/multiarch/ifunc-impl-list.c2
-rw-r--r--sysdeps/aarch64/multiarch/strlen.c10
-rw-r--r--sysdeps/aarch64/multiarch/strlen_asimd.S267
-rw-r--r--sysdeps/aarch64/multiarch/strlen_mte.S (renamed from sysdeps/aarch64/multiarch/strlen_generic.S)6
5 files changed, 161 insertions, 126 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 35d0978..dc3efff 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,5 +3,5 @@ sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx
memcpy_falkor \
memset_generic memset_falkor memset_emag memset_kunpeng \
memchr_generic memchr_nosimd \
- strlen_generic strlen_asimd
+ strlen_mte strlen_asimd
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 4b004ac..09feea9 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -63,7 +63,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
return i;
}
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 99f2cf2..7c0352d 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -26,17 +26,15 @@
# include <string.h>
# include <init-arch.h>
-#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
+/* This should check HWCAP_MTE when it is available. */
+#define MTE_ENABLED() (false)
extern __typeof (__redirect_strlen) __strlen;
-extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
-libc_ifunc (__strlen,
- (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
- ? __strlen_asimd
- :__strlen_generic));
+libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));
# undef strlen
strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 236a2c9..c7e6994 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -1,4 +1,4 @@
-/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
+/* Optimized strlen implementation using SIMD.
Copyright (C) 2018-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -20,80 +20,90 @@
#include <sysdep.h>
/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
+ */
+
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
- ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k. */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-#define dataq q2
-#define datav v2
-#define datab2 b3
-#define dataq2 q3
-#define datav2 v3
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 16
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned load
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- strlen_asimd will be repeatedly called on strings with the same
- length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 16 bytes per iteration.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
-
-ENTRY_ALIGN (__strlen_asimd, 6)
- DELOUSE (0)
- DELOUSE (1)
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
+
+ENTRY (__strlen_asimd)
+ DELOUSE (0)
+
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+ byte-swap the data now so has_null1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
-
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
@@ -101,78 +111,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
+
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-
-L(main_loop):
- ldr dataq, [src, 32]!
-L(page_cross_entry):
- /* Get the minimum value and keep going if it is not zero. */
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- cbz tmp1, L(tail)
- ldr dataq, [src, 16]
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- cbnz tmp1, L(main_loop)
- add src, src, 16
-
-L(tail):
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
- rev64 datav.16b, datav.16b
-#endif
- /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
- pair of scalars and then compute the length from the earliest NULL
- byte. */
- cmeq datav.16b, datav.16b, #0
- mov data1, datav.d[0]
- mov data2, datav.d[1]
- cmp data1, 0
- csel data1, data1, data2, ne
- sub len, src, srcin
rev data1, data1
- add tmp2, len, 8
- clz tmp1, data1
- csel len, len, tmp2, ne
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(loop_entry)
+
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 24
+ rev has_nul1, has_nul1
+ mov tmp3, 16
+ clz tmp1, has_nul1
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0xff, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- mov tmp3, 63
- bic src, srcin, 15
- and tmp1, srcin, 7
- ands tmp2, srcin, 8
- ldr dataq, [src]
- lsl tmp1, tmp1, 3
- csel tmp2, tmp2, tmp1, eq
- csel tmp1, tmp1, tmp3, eq
- mov tmp4, -1
+L(loop_entry):
+ bic src, srcin, 31
+
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1
- lsr tmp2, tmp4, tmp2
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1
- lsl tmp2, tmp4, tmp2
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- mov datav2.d[0], tmp1
- mov datav2.d[1], tmp2
- orn datav.16b, datav.16b, datav2.16b
- b L(page_cross_entry)
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
END (__strlen_asimd)
weak_alias (__strlen_asimd, strlen_asimd)
libc_hidden_builtin_def (strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S
index 61d3f72..b8daa54 100644
--- a/sysdeps/aarch64/multiarch/strlen_generic.S
+++ b/sysdeps/aarch64/multiarch/strlen_mte.S
@@ -17,14 +17,14 @@
<https://www.gnu.org/licenses/>. */
/* The actual strlen code is in ../strlen.S. If we are building libc this file
- defines __strlen_generic. Otherwise the include of ../strlen.S will define
+ defines __strlen_mte. Otherwise the include of ../strlen.S will define
the normal __strlen entry points. */
#include <sysdep.h>
#if IS_IN (libc)
-# define STRLEN __strlen_generic
+# define STRLEN __strlen_mte
/* Do not hide the generic version of strlen, we use it internally. */
# undef libc_hidden_builtin_def
@@ -32,7 +32,7 @@
# ifdef SHARED
/* It doesn't make sense to send libc-internal strlen calls through a PLT. */
- .globl __GI_strlen; __GI_strlen = __strlen_generic
+ .globl __GI_strlen; __GI_strlen = __strlen_mte
# endif
#endif