aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorSiddhesh Poyarekar <siddhesh@sourceware.org>2017-12-13 18:50:27 +0530
committerSiddhesh Poyarekar <siddhesh@sourceware.org>2017-12-13 18:50:27 +0530
commit2bce01ebbaf8db52ba4a5635eb5744f989cdbf69 (patch)
treeebe39cd2ac5534dccbda2169eb1753d3781df178 /sysdeps
parent243b63337c2c02f30ec3a988ecc44bc0f6ffa0ad (diff)
downloadglibc-2bce01ebbaf8db52ba4a5635eb5744f989cdbf69.zip
glibc-2bce01ebbaf8db52ba4a5635eb5744f989cdbf69.tar.gz
glibc-2bce01ebbaf8db52ba4a5635eb5744f989cdbf69.tar.bz2
aarch64: Improve strcmp unaligned performance
Replace the simple byte-wise compare in the misaligned case with a dword compare with page boundary checks in place. For simplicity I've chosen a 4K page boundary so that we don't have to query the actual page size on the system. This results in up to 3x improvement in performance in the unaligned case on falkor and about 2.5x improvement on mustang as measured using bench-strcmp. * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a time whenever possible.
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/aarch64/strcmp.S31
1 files changed, 29 insertions, 2 deletions
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index e99d662..c260e1d 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -72,6 +72,7 @@ L(start_realigned):
cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
+L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@@ -145,12 +146,38 @@ L(mutual_align):
b L(start_realigned)
L(misaligned8):
- /* We can do better than this. */
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(misaligned8)
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(misaligned8)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
sub result, data1, data2
RET
END(strcmp)