about summary refs log tree commit diff
diff options
context:
space:
mode:
authorSiddhesh Poyarekar <siddhesh@sourceware.org>2017-12-13 18:50:27 +0530
committerWilco Dijkstra <wdijkstr@arm.com>2019-09-06 17:13:02 +0100
commit4e75091d6ce3f7ac8b1750ca6135bc37d6707caf (patch)
tree82fdb5df1150aeeb72cff2ad6b354d719f854e90
parent8569357e11aa7b8e912142727eac1d106c785433 (diff)
downloadglibc-4e75091d6ce3f7ac8b1750ca6135bc37d6707caf.zip
glibc-4e75091d6ce3f7ac8b1750ca6135bc37d6707caf.tar.gz
glibc-4e75091d6ce3f7ac8b1750ca6135bc37d6707caf.tar.bz2
aarch64: Improve strcmp unaligned performance
Replace the simple byte-wise compare in the misaligned case with a dword compare with page boundary checks in place. For simplicity I've chosen a 4K page boundary so that we don't have to query the actual page size on the system. This results in up to 3x improvement in performance in the unaligned case on falkor and about 2.5x improvement on mustang as measured using bench-strcmp.

	* sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a
	time whenever possible.

(cherry picked from commit 2bce01ebbaf8db52ba4a5635eb5744f989cdbf69)
-rw-r--r--ChangeLog5
-rw-r--r--sysdeps/aarch64/strcmp.S31
2 files changed, 34 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 18a01ed..cd0c1db 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+ * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a
+ time whenever possible.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
* sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target.
* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index e99d662..c260e1d 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -72,6 +72,7 @@ L(start_realigned):
cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
+L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@@ -145,12 +146,38 @@ L(mutual_align):
b L(start_realigned)
L(misaligned8):
- /* We can do better than this. */
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(misaligned8)
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(misaligned8)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
sub result, data1, data2
RET
END(strcmp)