author     Wilco Dijkstra <wdijkstr@arm.com>   2021-12-02 18:30:55 +0000
committer  Wilco Dijkstra <wdijkstr@arm.com>   2021-12-02 18:35:53 +0000
commit     b51eb35c572b015641f03e3682c303f7631279b7 (patch)
tree       3d430053c36fd853ed02f96873c215406eba7e47
parent     d120fb9941be1fb1934f0b50c6ad64e4c5e404fb (diff)
AArch64: Optimize memcmp
Rewrite memcmp to improve performance. On small and medium inputs performance
is 10-20% better. Large inputs use a SIMD loop processing 64 bytes per
iteration, which is 30-50% faster depending on the size.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
-rw-r--r--  sysdeps/aarch64/memcmp.S  241
1 file changed, 134 insertions, 107 deletions
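
Illustrative sketch (not part of the patch): the rewritten small-input paths
avoid a byte loop by doing overlapping unaligned loads from the head and the
tail of each buffer, picking the first pair of 8-byte chunks that differs, and
byte-reversing it so a single unsigned compare produces the memcmp ordering.
The C below is a rough little-endian model of the 8-15 byte case only; the
function names are invented, and __builtin_bswap64 plus memcpy stand in for
the rev instruction and AArch64 unaligned ldr.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Compare two 8-byte chunks the way L(return)/L(return2) do: byte-swap so
   the lowest-addressed byte becomes most significant, then compare
   unsigned (little-endian shown; big-endian skips the swap).  */
static int
cmp_chunk (uint64_t a, uint64_t b)
{
  if (a == b)
    return 0;
  a = __builtin_bswap64 (a);    /* mirrors: rev data1, data1 */
  b = __builtin_bswap64 (b);    /* mirrors: rev data2, data2 */
  return a < b ? -1 : 1;
}

/* Rough model of the 8..15 byte path: the head and tail loads overlap,
   so every byte is covered without a loop.  */
static int
small_memcmp (const void *s1, const void *s2, size_t n)
{
  uint64_t h1, h2, t1, t2;
  memcpy (&h1, s1, 8);                           /* ldr data1, [src1]        */
  memcpy (&h2, s2, 8);                           /* ldr data2, [src2]        */
  memcpy (&t1, (const char *) s1 + n - 8, 8);    /* ldr data3, [src1end, -8] */
  memcpy (&t2, (const char *) s2 + n - 8, 8);    /* ldr data4, [src2end, -8] */
  int r = cmp_chunk (h1, h2);
  return r != 0 ? r : cmp_chunk (t1, t2);
}

In the assembly the same selection is done branchlessly with csel at
L(return2), and the 16-32 byte and loop32 tail cases apply the identical
overlapping-load trick with 16-byte chunks via src1end/src2end.
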
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 5dc4ee5..f734bc8 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,105 +22,79 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data1h x4
-#define data2 x5
-#define data2w w5
-#define data2h x6
-#define tmp1 x7
-#define tmp2 x8
-
-ENTRY_ALIGN (memcmp, 6)
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define data3 x5
+#define data3w w5
+#define data4 x6
+#define data4w w6
+#define tmp x6
+#define src1end x7
+#define src2end x8
+
+
+ENTRY (memcmp)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
- subs limit, limit, 16
+ cmp limit, 16
b.lo L(less16)
-
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
ccmp data1, data2, 0, ne
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
- subs limit, limit, 16
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
b.ls L(last_bytes)
- cmp limit, 112
- b.lo L(loop16)
-
- and tmp1, src1, 15
- add limit, limit, tmp1
- sub src1, src1, tmp1
- sub src2, src2, tmp1
- subs limit, limit, 48
+ cmp limit, 160
+ b.hs L(loop_align)
+ sub limit, limit, 32
- /* Compare 128 up bytes using aligned access. */
.p2align 4
-L(loop64):
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
- cmp data1, data2
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
-
- ldp data1, data1h, [src1, 16]
- ldp data2, data2h, [src2, 16]
- cmp data1, data2
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
-
- ldp data1, data1h, [src1, 32]
- ldp data2, data2h, [src2, 32]
- cmp data1, data2
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
-
- ldp data1, data1h, [src1, 48]
- ldp data2, data2h, [src2, 48]
+L(loop32):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
cmp data1, data2
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ cmp limit, 16
+ b.ls L(last_bytes)
- subs limit, limit, 64
- add src1, src1, 64
- add src2, src2, 64
- b.pl L(loop64)
- adds limit, limit, 48
- b.lo L(last_bytes)
-
-L(loop16):
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
cmp data1, data2
- ccmp data1h, data2h, 0, eq
- b.ne L(return64)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ add src1, src1, 32
+ add src2, src2, 32
+L(last64):
+ subs limit, limit, 32
+ b.hi L(loop32)
- subs limit, limit, 16
- b.hi L(loop16)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- add src1, src1, limit
- add src2, src2, limit
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+L(return2):
+ cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
-L(return64):
- cmp data1, data2
- csel data1, data1, data1h, ne
- csel data2, data2, data2h, ne
L(return):
#ifndef __AARCH64EB__
rev data1, data1
@@ -133,45 +107,98 @@ L(return):
.p2align 4
L(less16):
- adds limit, limit, 8
- b.lo L(less8) //lo:<
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, L(less8)
ldr data1, [src1]
ldr data2, [src2]
- /* equal 8 optimized */
- ccmp data1, data2, 0, ne
- b.ne L(return)
-
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b L(return2)
.p2align 4
L(less8):
- adds limit, limit, 4
- b.lo L(less4)
+ tbz limit, 2, L(less4)
ldr data1w, [src1]
ldr data2w, [src2]
- ccmp data1w, data2w, 0, ne
- b.ne L(return)
- ldr data1w, [src1, limit]
- ldr data2w, [src2, limit]
- b L(return)
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b L(return2)
- .p2align 4
L(less4):
- adds limit, limit, 4
- b.eq L(ret_0)
-
-L(byte_loop):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- subs limit, limit, 1
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq L(byte_loop)
+ tbz limit, 1, L(less2)
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
+ cmp data1w, data2w
+ b.ne L(return)
+L(less2):
+ mov result, 0
+ tbz limit, 0, L(return_zero)
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
sub result, data1w, data2w
+L(return_zero):
ret
-L(ret_0):
- mov result, 0
+
+L(loop_align):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+L(loop64):
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq L(loop64)
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, L(last64)
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
ret
END (memcmp)
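
Illustrative sketch (not part of the patch): how the 64-byte SIMD loop at
L(loop64) detects a mismatch. Four eor instructions leave non-zero bytes only
where the inputs differ, and the chain of umaxp pairwise-max reductions folds
those four XOR vectors into the single 64-bit value moved to tmp by fmov,
where byte i is non-zero exactly when the i-th 8-byte group of the block
differs. That is the property the rev/clz sequence then uses to locate and
reload the first differing word. The NEON-intrinsics function below is an
invented, little-endian model of one iteration's check, without the
software-pipelined loads of the real loop.

#include <arm_neon.h>
#include <stdint.h>

/* Returns 0 iff the 64-byte blocks at p and q are equal.  Otherwise byte i
   of the result is non-zero iff bytes [8*i, 8*i + 8) of the blocks differ.  */
static uint64_t
block64_diff (const uint8_t *p, const uint8_t *q)
{
  /* eor: non-zero lanes mark differing bytes.  */
  uint8x16_t d0 = veorq_u8 (vld1q_u8 (p),      vld1q_u8 (q));
  uint8x16_t d1 = veorq_u8 (vld1q_u8 (p + 16), vld1q_u8 (q + 16));
  uint8x16_t d2 = veorq_u8 (vld1q_u8 (p + 32), vld1q_u8 (q + 32));
  uint8x16_t d3 = veorq_u8 (vld1q_u8 (p + 48), vld1q_u8 (q + 48));
  /* umaxp pairwise maxima: each reduction doubles how many input bytes
     one result byte covers (2, then 4, then 8).  */
  uint8x16_t m01 = vpmaxq_u8 (d0, d1);
  uint8x16_t m23 = vpmaxq_u8 (d2, d3);
  uint8x16_t m   = vpmaxq_u8 (m01, m23);
  m = vpmaxq_u8 (m, m);
  /* Low 64 bits, as in "fmov tmp, d0".  */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (m), 0);
}

The loop exits when either the remaining length runs out (the subs) or this
folded value is non-zero (the ccmp/b.eq). On a mismatch the patch applies rev
and clz to tmp (after a rev16 fix-up on big-endian), rounds the bit index down
to a multiple of 8 with bic, and reloads just that 8-byte pair for the final
ordering compare.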