aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXuelei Zhang <zhangxuelei4@huawei.com>2019-12-19 13:49:46 +0000
committerAdhemerval Zanella <adhemerval.zanella@linaro.org>2019-12-19 16:31:04 -0300
commit2911cb68ed3d6c515ad1979237e74e1fefab3674 (patch)
treee7a53b39337a460ef833279f1d0a3c2ab54177fb
parent0237b61526e716fa9597f521643908a4fda3b46a (diff)
downloadglibc-2911cb68ed3d6c515ad1979237e74e1fefab3674.zip
glibc-2911cb68ed3d6c515ad1979237e74e1fefab3674.tar.gz
glibc-2911cb68ed3d6c515ad1979237e74e1fefab3674.tar.bz2
aarch64: Optimized implementation of strnlen
Optimize the strnlen implementation by using vector operations and loop unrolling in the main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
-rw-r--r--sysdeps/aarch64/strnlen.S52
1 files changed, 51 insertions, 1 deletions
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c8..a57753b 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -45,6 +45,11 @@
#define pos x13
#define limit_wd x14
+#define dataq q2
+#define datav v2
+#define datab2 b3
+#define dataq2 q3
+#define datav2 v3
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@@ -71,7 +76,7 @@ ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
cycle, as we get much better parallelism out of the operations. */
/* Start of critial section -- keep to one 64Byte cache line. */
-L(loop):
+
ldp data1, data2, [src], #16
L(realigned):
sub tmp1, data1, zeroones
@@ -119,6 +124,64 @@ L(nul_in_data2):
csel len, len, limit, ls /* Return the lower value. */
RET
+L(loop):
+ /* Main loop, unrolled twice. Each step loads one 16-byte Qword,
+ reduces it to its minimum byte (zero iff the Qword holds a NUL)
+ and decrements limit_wd. A step leaves the loop when a NUL was
+ seen or when limit_wd went negative. */
+ ldr dataq, [src], #16
+ uminv datab2, datav.16b
+ mov tmp1, datav2.d[0]
+ subs limit_wd, limit_wd, #1
+ ccmp tmp1, #0, #4, pl /* NZCV = 0100 */
+ b.eq L(loop_end)
+ ldr dataq, [src], #16
+ uminv datab2, datav.16b
+ mov tmp1, datav2.d[0]
+ subs limit_wd, limit_wd, #1
+ ccmp tmp1, #0, #4, pl /* NZCV = 0100 */
+ b.ne L(loop)
+L(loop_end):
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ cbnz tmp1, L(hit_limit) /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+#ifdef __AARCH64EB__
+ rev64 datav.16b, datav.16b
+#endif
+ /* Set the NULL byte as 0xff and the rest as 0x00, move the data into a
+ pair of scalars and then compute the length from the earliest NULL
+ byte. */
+
+ cmeq datav.16b, datav.16b, #0
+ /* data1 must hold the first eight bytes in memory. On big-endian
+ the Qword load places them in the upper half of the register and
+ rev64 only reverses the bytes within each half, so the halves
+ have to be picked in the opposite order there. */
+#ifdef __AARCH64EB__
+ mov data1, datav.d[1]
+ mov data2, datav.d[0]
+#else
+ mov data1, datav.d[0]
+ mov data2, datav.d[1]
+#endif
+ cmp data1, 0 /* Flags stay live until the second csel. */
+ csel data1, data1, data2, ne
+ sub len, src, srcin
+ sub len, len, #16 /* src was post-incremented past the Qword. */
+ rev data1, data1 /* First byte in memory becomes the top byte. */
+ add tmp2, len, 8
+ clz tmp1, data1
+ csel len, len, tmp2, ne /* Skip 8 bytes if the null is in data2. */
+ add len, len, tmp1, lsr 3 /* Leading zero bits / 8 = byte index. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
+ RET
+
L(misaligned):
/* Deal with a partial first word.
We're doing two things in parallel here;